Skip to content

Commit 66748e1

Browse files
committed
Properly dispatch the kernels that use atomic_add; 16-bit atomic support can only be checked at runtime
1 parent 47d2b13 commit 66748e1

File tree

4 files changed

+81
-9
lines changed

4 files changed

+81
-9
lines changed

common/cuda_hip/matrix/coo_kernels.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,10 +271,20 @@ void spmv2(std::shared_ptr<const DefaultExecutor> exec,
271271
return;
272272
}
273273
// not support 16 bit atomic
274-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
274+
#if !defined(CUDA_VERSION)
275275
if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
276276
GKO_NOT_SUPPORTED(c);
277277
} else
278+
#else
279+
auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
280+
as<CudaExecutor>(exec)->get_minor_version();
281+
if (compute_capability < 70 &&
282+
std::is_same_v<remove_complex<ValueType>, half>) {
283+
GKO_NOT_SUPPORTED(c);
284+
} else if (compute_capability < 80 &&
285+
std::is_same_v<remove_complex<ValueType>, bfloat16>) {
286+
GKO_NOT_SUPPORTED(c);
287+
} else
278288
#endif
279289
{
280290
// TODO: b_ncols needs to be tuned for ROCm.
@@ -323,10 +333,20 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
323333
return;
324334
}
325335
// not support 16 bit atomic
326-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
336+
#if !defined(CUDA_VERSION)
327337
if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
328338
GKO_NOT_SUPPORTED(c);
329339
} else
340+
#else
341+
auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
342+
as<CudaExecutor>(exec)->get_minor_version();
343+
if (compute_capability < 70 &&
344+
std::is_same_v<remove_complex<ValueType>, half>) {
345+
GKO_NOT_SUPPORTED(c);
346+
} else if (compute_capability < 80 &&
347+
std::is_same_v<remove_complex<ValueType>, bfloat16>) {
348+
GKO_NOT_SUPPORTED(c);
349+
} else
330350
#endif
331351
{
332352
// TODO: b_ncols needs to be tuned for ROCm.

common/cuda_hip/matrix/csr_kernels.template.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2091,11 +2091,21 @@ bool load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
20912091
using arithmetic_type =
20922092
highest_precision<InputValueType, OutputValueType, MatrixValueType>;
20932093

2094-
// not support 16 bit atomic
2095-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
2094+
// not support 16 bit atomic
2095+
#if !defined(CUDA_VERSION)
20962096
if constexpr (sizeof(remove_complex<OutputValueType>) == sizeof(int16)) {
20972097
return false;
20982098
} else
2099+
#else
2100+
auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
2101+
as<CudaExecutor>(exec)->get_minor_version();
2102+
if (compute_capability < 70 &&
2103+
std::is_same_v<remove_complex<OutputValueType>, half>) {
2104+
return false;
2105+
} else if (compute_capability < 80 &&
2106+
std::is_same_v<remove_complex<OutputValueType>, bfloat16>) {
2107+
return false;
2108+
} else
20992109
#endif
21002110
{
21012111
if (beta) {

common/cuda_hip/matrix/ell_kernels.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ void abstract_spmv(syn::value_list<int, info>,
252252
b->get_size()[1], 1);
253253

254254
// not support 16 bit atomic
255-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
255+
#if !defined(CUDA_VERSION)
256256
// We do atomic on shared memory when num_thread_per_worker is not 1.
257257
// If atomic is also true, we also do atomic on out_vector.
258258
constexpr bool shared_half =
@@ -263,6 +263,25 @@ void abstract_spmv(syn::value_list<int, info>,
263263
(shared_half || atomic_half_out)) {
264264
GKO_KERNEL_NOT_FOUND;
265265
} else
266+
#else
267+
constexpr bool shared_half =
268+
sizeof(remove_complex<arithmetic_type>) == sizeof(half);
269+
constexpr bool atomic_half_out =
270+
atomic && sizeof(remove_complex<OutputValueType>) == sizeof(half);
271+
constexpr bool shared_bfloat16 =
272+
sizeof(remove_complex<arithmetic_type>) == sizeof(bfloat16);
273+
constexpr bool atomic_bfloat16_out =
274+
atomic && sizeof(remove_complex<OutputValueType>) == sizeof(bfloat16);
275+
auto compute_capability = as<CudaExecutor>(exec)->get_major_version() * 10 +
276+
as<CudaExecutor>(exec)->get_minor_version();
277+
if (num_thread_per_worker != 1 && (shared_half || atomic_half_out) &&
278+
compute_capability < 70) {
279+
GKO_KERNEL_NOT_FOUND;
280+
} else if (num_thread_per_worker != 1 &&
281+
(shared_bfloat16 || atomic_bfloat16_out) &&
282+
compute_capability < 80) {
283+
GKO_KERNEL_NOT_FOUND;
284+
} else
266285
#endif
267286
{
268287
const auto a_vals = acc::range<a_accessor>(

common/cuda_hip/solver/idr_kernels.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,11 +454,22 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
454454
if (nrhs > 1 || is_complex<ValueType>()) {
455455
components::fill_array(exec, alpha->get_values(), nrhs,
456456
zero<ValueType>());
457-
// not support 16 bit atomic
458-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
457+
// not support 16 bit atomic
458+
#if !defined(CUDA_VERSION)
459459
if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
460460
GKO_NOT_SUPPORTED(alpha);
461461
} else
462+
#else
463+
auto compute_capability =
464+
as<CudaExecutor>(exec)->get_major_version() * 10 +
465+
as<CudaExecutor>(exec)->get_minor_version();
466+
if (compute_capability < 70 &&
467+
std::is_same_v<remove_complex<ValueType>, half>) {
468+
GKO_NOT_SUPPORTED(alpha);
469+
} else if (compute_capability < 80 &&
470+
std::is_same_v<remove_complex<ValueType>, bfloat16>) {
471+
GKO_NOT_SUPPORTED(alpha);
472+
} else
462473
#endif
463474
{
464475
multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
@@ -513,11 +524,23 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
513524
auto m_i = m->get_values() + i * m_stride + k * nrhs;
514525
if (nrhs > 1 || is_complex<ValueType>()) {
515526
components::fill_array(exec, m_i, nrhs, zero<ValueType>());
516-
// not support 16 bit atomic
517-
#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700))
527+
528+
// not support 16 bit atomic
529+
#if !defined(CUDA_VERSION)
518530
if constexpr (sizeof(remove_complex<ValueType>) == sizeof(int16)) {
519531
GKO_NOT_SUPPORTED(m_i);
520532
} else
533+
#else
534+
auto compute_capability =
535+
as<CudaExecutor>(exec)->get_major_version() * 10 +
536+
as<CudaExecutor>(exec)->get_minor_version();
537+
if (compute_capability < 70 &&
538+
std::is_same_v<remove_complex<ValueType>, half>) {
539+
GKO_NOT_SUPPORTED(m_i);
540+
} else if (compute_capability < 80 &&
541+
std::is_same_v<remove_complex<ValueType>, bfloat16>) {
542+
GKO_NOT_SUPPORTED(m_i);
543+
} else
521544
#endif
522545
{
523546
multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(

0 commit comments

Comments
 (0)