Skip to content

Commit 055dbbc

Browse files
spcypptmeta-codesync[bot]
authored and committed
Improve GIS error check (#5195)
Summary: Pull Request resolved: #5195 X-link: https://github.com/facebookresearch/FBGEMM/pull/2192 as titled. Reviewed By: q10 Differential Revision: D88234100 fbshipit-source-id: 51fe955f26cf6cf27212bfef12f398a28f48aa65
1 parent dddb378 commit 055dbbc

File tree

1 file changed

+59
-15
lines changed

1 file changed

+59
-15
lines changed

fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp

Lines changed: 59 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,8 @@ class IndexSelectDim0GPUOp
123123
const bool skip_indices_sorting_fwd) {
124124
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input, indices);
125125
// Expect a 1D index tensor
126-
TORCH_CHECK(indices.dim() == 1, "Index tensor must be 1D")
126+
TORCH_CHECK(
127+
indices.dim() == 1, "Index tensor must be 1D, but got ", indices.dim());
127128

128129
Tensor sorted_indices, orig_indices;
129130
if (skip_indices_sorting_fwd) {
@@ -149,7 +150,10 @@ class IndexSelectDim0GPUOp
149150
static torch::autograd::variable_list backward(
150151
torch::autograd::AutogradContext* ctx,
151152
torch::autograd::variable_list grad_outputs) {
152-
TORCH_CHECK(grad_outputs.size() == 1);
153+
TORCH_CHECK(
154+
grad_outputs.size() == 1,
155+
"The size of grad_outputs should be 1, but got ",
156+
grad_outputs.size());
153157
TENSOR_ON_CUDA_GPU(grad_outputs[0]);
154158

155159
bool skip_indices_sorting_fwd =
@@ -237,7 +241,8 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
237241
at::TensorOptions().dtype(at::kByte).pinned_memory(true));
238242

239243
// Ensure that args_tensor is contiguous
240-
TORCH_CHECK(args_tensor.is_contiguous());
244+
TORCH_CHECK(
245+
args_tensor.is_contiguous(), "Tensor args_tensor must be contiguous.");
241246

242247
// Initialize raw pointers to point to Tensor args_tensor
243248
int64_t* input_ptrs = nullptr;
@@ -288,7 +293,14 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
288293
// Verify that all input tensors have the same number of dimensions
289294
TORCH_CHECK(
290295
input_dim == input.dim(),
291-
"All inputs in group_index_select must have the same number of dimensions");
296+
"All inputs in group_index_select must have the same number of dimensions. Expect ",
297+
input_dim,
298+
" but got group ",
299+
i,
300+
" with ",
301+
input.dim(),
302+
". Group size is ",
303+
group_size);
292304

293305
// Verify that all tensors are on the same GPU
294306
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input, indices);
@@ -298,7 +310,14 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
298310
// Verify that all input tensors have the same shape[0]
299311
TORCH_CHECK(
300312
num_output_rows == num_output_rows_,
301-
"The number of indices to be selected must be the same for the entire group");
313+
"The number of indices to be selected must be the same for the entire group of ",
314+
group_size,
315+
". Expect indices size to be ",
316+
num_output_rows,
317+
", but got group ",
318+
i,
319+
" with indices size of ",
320+
num_output_rows_);
302321
const auto input_reshaped_ = input.reshape({input.size(0), -1});
303322

304323
// Number of columns can be different
@@ -314,7 +333,7 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
314333
input_shape[0] = num_output_rows_;
315334
Tensor output = at::empty(input_shape, input.options());
316335
// Ensure that the allocated output is contiguous
317-
TORCH_CHECK(output.is_contiguous())
336+
TORCH_CHECK(output.is_contiguous(), "output tensor must be contiguous.");
318337
output_group.push_back(output);
319338

320339
// Store input and indices contigs to keep them alive during the kernel
@@ -360,7 +379,8 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
360379
auto saved_data_t = at::empty(
361380
{sizeof(saved_data) / sizeof(int64_t)},
362381
at::TensorOptions().dtype(at::kLong));
363-
TORCH_CHECK(saved_data_t.is_contiguous());
382+
TORCH_CHECK(
383+
saved_data_t.is_contiguous(), "Tensor saved_data_t must be contiguous.");
364384
memcpy(saved_data_t.data_ptr<int64_t>(), saved_data, sizeof(saved_data));
365385

366386
group_index_select_or_add_cuda(
@@ -389,7 +409,10 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
389409
static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
390410
at::TensorList all_inputs,
391411
c10::SymIntArrayRef output_shape_group_ref) {
392-
TORCH_CHECK(all_inputs.size() > 2);
412+
TORCH_CHECK(
413+
all_inputs.size() > 2,
414+
"all_inputs size must be larger than 2, but got ",
415+
all_inputs.size());
393416

394417
// all_input size = group_size * 2 (from grads, indices)
395418
// + 1 args_tensor + 1 saved_data + 1 first input
@@ -412,11 +435,18 @@ static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
412435
all_inputs.cbegin() + group_size, all_inputs.cbegin() + 2 * group_size);
413436

414437
// Retrieve saved data
415-
TORCH_CHECK(saved_data.device() == at::kCPU);
416-
TORCH_CHECK(saved_data.is_contiguous());
438+
TORCH_CHECK(
439+
saved_data.device() == at::kCPU, "Tensor saved_data must be on CPU.");
440+
TORCH_CHECK(
441+
saved_data.is_contiguous(), "Tensor saved_data must be contiguous.");
417442
int64_t* saved_data_ptr = saved_data.data_ptr<int64_t>();
418443
// Check that the size is the same
419-
TORCH_CHECK(saved_data_ptr[0] == group_size);
444+
TORCH_CHECK(
445+
saved_data_ptr[0] == group_size,
446+
"The size of saved_data[0] must match group_size. Expect ",
447+
group_size,
448+
" but got ",
449+
saved_data_ptr[0]);
420450
const bool use_var_cols = saved_data_ptr[1];
421451
int64_t* warp_offsets_group = reinterpret_cast<int64_t*>(saved_data_ptr[2]);
422452
int32_t* num_cols_group = reinterpret_cast<int32_t*>(saved_data_ptr[3]);
@@ -448,7 +478,8 @@ static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
448478
{group_size * 3},
449479
at::TensorOptions().dtype(at::kLong).pinned_memory(true));
450480
// Ensure that args_tensor is contiguous
451-
TORCH_CHECK(args_tensor.is_contiguous());
481+
TORCH_CHECK(
482+
args_tensor.is_contiguous(), "Tensor args_tensor must be contiguous.");
452483
int64_t* grad_output_ptrs = args_tensor.data_ptr<int64_t>();
453484
int64_t* grad_input_ptrs = args_tensor.data_ptr<int64_t>() + group_size;
454485
int64_t* indices_ptrs = args_tensor.data_ptr<int64_t>() + 2 * group_size;
@@ -485,20 +516,33 @@ static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
485516
// Allocate a big tensor to avoid calling many small elementwise kernels
486517
const auto group_grad_input =
487518
at::zeros({group_grad_input_numel}, fwd_input.options());
488-
TORCH_CHECK(group_grad_input.is_contiguous());
519+
TORCH_CHECK(
520+
group_grad_input.is_contiguous(),
521+
"Tensor group_grad_input must be contiguous.");
489522

490523
// Split to output_group
491524
auto output_group = group_grad_input.split(grad_input_numels, 0);
492525

493-
TORCH_CHECK(output_group.size() == static_cast<size_t>(group_size));
526+
TORCH_CHECK(
527+
output_group.size() == static_cast<size_t>(group_size),
528+
"output_group size must be ",
529+
group_size,
530+
" but got ",
531+
output_group.size());
494532

495533
// Reshape grad inputs and obtain their pointers
496534
for (int i = 0; i < group_size; i++) {
497535
const auto grad_input_shape = std::vector<int64_t>(
498536
output_shape_group.begin() + i * output_dim,
499537
output_shape_group.begin() + (i + 1) * output_dim);
500538
output_group[i] = output_group[i].reshape(grad_input_shape);
501-
TORCH_CHECK(output_group[i].is_contiguous());
539+
TORCH_CHECK(
540+
output_group[i].is_contiguous(),
541+
"Tensor output_group ",
542+
i,
543+
" of ",
544+
group_size,
545+
" must be contiguous.");
502546
grad_input_ptrs[i] = reinterpret_cast<int64_t>(output_group[i].data_ptr());
503547

504548
// 2) Add group_size gradients for inputs

0 commit comments

Comments
 (0)