Skip to content

allgather causes SEGFAULT #56

@Iain-S

Description

@Iain-S

Summary

Calling torch.distributed.all_gather() when using the 'ccl' backend results in a SEGFAULT if the tensors being gathered are larger than a few megabytes.

This problem also seems to occur with gather().

Steps to Reproduce

See my minimal reproducible example repo here: https://github.com/Iain-S/torch-ccl-segfault/tree/main

Using a tensor of around 11MiB is enough to cause a segfault.

Expected Behaviour

I would expect all_gather() to complete successfully rather than crash with a SEGFAULT.

Actual Behaviour

I get the following output:

Caught signal 11 (Segmentation fault: address not mapped to object at address 0xff00000002600000)

LIBXSMM_VERSION: main_stable-1.17-3651 (25693763)
==== backtrace (tid:  43554) ====
 0 0x0000000000012cf0 __funlockfile()  :0
 1 0x00000000000cee73 __memmove_avx_unaligned_erms()  :0
 2 0x00000000004b80c3 zeKernelSetIndirectAccessTracing()  ???:0
 3 0x0000000000106cb8 zetGetMetricGroupExpProcAddrTable()  ???:0
 4 0x00000000000fc2da zetGetMetricGroupExpProcAddrTable()  ???:0
 5 0x00000000001967a4 zetGetMetricGroupExpProcAddrTable()  ???:0
 6 0x00000000001a5aa8 zetGetMetricGroupExpProcAddrTable()  ???:0
 7 0x00000000000f1f34 ???()  /lib64/libze_intel_gpu.so.1:0
 8 0x0000000000046103 std::vector<void*, std::allocator<void*> >::resize()  ???:0
 9 0x0000000000012975 zeGetFabricVertexExpProcAddrTable()  ???:0
10 0x000000000052cce9 ze_cmd_memory_copy::ze_call()  :0
11 0x000000000052d933 ze_copy_entry::init_ze_hook()  :0
12 0x000000000050d14d ze_base_entry::init()  :0
13 0x000000000050df3a ze_base_entry::init_entries()  :0
14 0x000000000050e25f ze_base_entry::start()  :0
15 0x0000000000470578 sched_entry::do_progress()  :0
16 0x0000000000482985 ccl_sched::do_progress()  :0
17 0x00000000003fe969 ccl_worker::process_sched_bin()  :0
18 0x00000000003fe540 ccl_worker::process_sched_queue()  :0
19 0x00000000003fd31b ccl_worker::do_work()  :0
20 0x00000000003f84bd ccl_executor::wait()  :0
21 0x00000000003061fb ccl_coll_create()  coll-f53f59.cpp:0
22 0x00000000003056a6 ccl_allgatherv_impl()  :0
23 0x000000000035aa1e ccl_comm::allgatherv_impl()  :0
24 0x000000000036d5cb ccl_comm::allgatherv()  :0
25 0x00000000004c4be1 ccl::v1::allgatherv()  ???:0
26 0x0000000000035fdd oneccl_bindings_for_pytorch::CollectiveAsyncWorkCCL<oneccl_bindings_for_pytorch::XPUCCLStubs::allgather_(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, c10d::ProcessGroupCCL&)::{lambda(at::Tensor, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, ccl::v1::allgatherv_attr, c
31 0x0000000000030aee c10d::ProcessGroupCCL::allgather()  ???:0
32 0x000000000002a3fa c10d::ops::allgather_xpu_()  ???:0
33 0x000000000003cecf c10::impl::make_boxed_from_unboxed_functor<c10::impl::detail::WrapFunctionIntoRuntimeFunctor_<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > >, c10::guts::typelist::typelist<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long> >, false>::call()  ???:0
34 0x0000000004c8d308 c10::impl::BoxedKernelWrapper<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), void>::call()  :0
35 0x0000000004c98b45 c10d::ProcessGroup::allgather()  :0
36 0x0000000000b7e8b0 pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> >, c10d::ProcessGroup, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::call_guard<pybind11::gil_scoped_release> >(c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (c10d::ProcessGroup::*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::alloca
::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&)#1}&&, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (*)(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN()  :0
37 0x00000000003975c7 pybind11::cpp_function::dispatcher()  :0
38 0x00000000001639fb PyCFunction_Call()  ???:0
39 0x000000000019046b _PyObject_MakeTpCall()  ???:0
40 0x00000000001c8c17 PyEval_EvalCodeEx()  ???:0
41 0x000000000020bcb1 _PyEval_EvalFrameDefault()  ???:0
42 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
43 0x00000000001638bb PyObject_Call()  ???:0
44 0x0000000000209030 _PyEval_EvalFrameDefault()  ???:0
45 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
46 0x000000000020bcb1 _PyEval_EvalFrameDefault()  ???:0
47 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
48 0x0000000000206f4d _PyEval_EvalFrameDefault()  ???:0
49 0x00000000001c7713 PyList_SetSlice()  ???:0
50 0x00000000001c870f _PyEval_EvalCodeWithName()  ???:0
51 0x00000000001c8743 PyEval_EvalCode()  ???:0
52 0x0000000000279dad _PyImport_FixupBuiltin()  ???:0
53 0x000000000028db0a PyAST_CompileObject()  ???:0
54 0x000000000011d2f6 PyRun_String()  ???:0
55 0x000000000028e325 PyRun_SimpleFileExFlags()  ???:0
56 0x000000000028e7d2 Py_RunMain()  ???:0
57 0x000000000028e919 Py_BytesMain()  ???:0

Versions

  • Python 3.9
  • For Python package versions, see the README in my repo.
  • CCL v2021.11.2
  • Running on an Intel(R) Data Center GPU Max 1550

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions