-
-
Notifications
You must be signed in to change notification settings - Fork 604
Open
Description
显卡:nvidia l40
测试代码:
#define SIZE 1024
__global__ void test_shfl_sync(int* x, int N){
int tid = threadIdx.x;
int val = tid ;
x[tid] = __shfl_sync(0xffffffff, val, 0);
__syncwarp();
}
int main(){
int* x = (int *)malloc(sizeof(int) * SIZE);
int* y = (int *)malloc(sizeof(int) * SIZE);
for(int i = 0; i < SIZE; i++){
x[i] = i;
y[i] = 0;
}
int *d_x1;
cudaMalloc(&d_x1, sizeof(int) * SIZE);
cudaMemcpy(d_x1, x, sizeof(int) * SIZE, cudaMemcpyHostToDevice);
int THREAD_NUM_PER_BLOCK = 64;
int block_num = (SIZE + THREAD_NUM_PER_BLOCK - 1) / THREAD_NUM_PER_BLOCK;
test_shfl_sync<<<block_num, THREAD_NUM_PER_BLOCK>>>(d_x1, SIZE);
cudaDeviceSynchronize();
cudaMemcpy(y, d_x1, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
for(int i = 0; i < 64; i++){
printf("%d ", y[i]);
}
printf("\n");
}
打印是:0 0 ... 0 32 32 ... 32
, 这个如预期,然后我修改mask为 0x0000ffff
,
__global__ void test_shfl_sync(int* x, int N){
int tid = threadIdx.x;
int val = tid ;
x[tid] = __shfl_sync(0x0000ffff, val, 0);
__syncwarp();
}
打印还是:0 0 ... 0 32 32 ... 32
, 这就搞不懂了
Metadata
Metadata
Assignees
Labels
No labels