Skip to content

fib_par.ml example reports TSAN violation #132

@dijkstracula

Description

@dijkstracula

Dear domainslib maintainers,

While working on a repro case for an unrelated data race, I observed that the fib_par.ml example in the README exhibits a TSAN data race!

(cli) ntaylor@smgrp ~/c/p/re_fun> opam switch
#  switch                compiler                                           description
→  semgrep-tsan          ocaml-option-tsan.1,ocaml-variants.5.3.0+options
          ocaml-variants = 5.3.0+options & ocaml-option-tsan
(* Short alias for the Domainslib task API used throughout this file. *)
module T = Domainslib.Task

(* Domain count for the run; [main] passes [num_domains - 1] to
   [T.setup_pool]. *)
let num_domains = 4

(* Fibonacci index to compute. Note: 10 is below the [n > 20] cutoff in
   [fib_par], so this run goes straight to the sequential [fib] and never
   calls [T.async]. *)
let n = 10

(* Sequential Fibonacci, using the convention fib 0 = fib 1 = 1.
   Any [n] below 2 (including negative values) yields 1. *)
let rec fib n =
  if n >= 2 then fib (n - 2) + fib (n - 1)
  else 1

(* Parallel Fibonacci on [pool]. Inputs of 20 or less are handed to the
   sequential [fib]; larger inputs spawn one async task per recursive
   call and sum the awaited results. *)
let rec fib_par pool n =
  if n <= 20 then
    (* Too little work to be worth spawning tasks. *)
    fib n
  else begin
    let spawn k = T.async pool (fun _ -> fib_par pool k) in
    let left = spawn (n - 1) in
    let right = spawn (n - 2) in
    T.await pool left + T.await pool right
  end

(* Sets up a task pool, computes [fib_par pool n] inside [T.run], tears the
   pool down, and prints the result. *)
let main () =
  (* NOTE(review): one fewer than [num_domains] is requested, presumably
     because the calling domain also takes part in [T.run] — confirm
     against the domainslib documentation. *)
  let worker_count = num_domains - 1 in
  let pool = T.setup_pool ~num_domains:worker_count () in
  let result = T.run pool (fun () -> fib_par pool n) in
  T.teardown_pool pool;
  Printf.printf "fib(%d) = %d\n" n result

(* Program entry point. *)
let () = main ()
(cli) ntaylor@smgrp ~/c/p/re_fun> export TSAN_OPTIONS="exitcode=0 history_size=7"
(cli) ntaylor@smgrp ~/c/p/re_fun> dune exec --profile tsan re_fun
[...]
==================
WARNING: ThreadSanitizer: data race (pid=79645)
  Read of size 8 at 0x000100e1b908 by thread T4 (mutexes: write M0):
    #0 camlSaturn__Ws_deque.steal_as_578 <null> (main.exe:arm64+0x100011798)
    #1 camlDomainslib__Multi_channel.recv_poll_loop_648 <null> (main.exe:arm64+0x100006e74)
    #2 camlDomainslib__Multi_channel.recv_poll_repeated_665 <null> (main.exe:arm64+0x100007684)
    #3 camlDomainslib__Multi_channel.recv_669 <null> (main.exe:arm64+0x10000791c)
    #4 camlDomainslib__Task.worker_628 <null> (main.exe:arm64+0x1000091f8)
    #5 camlStdlib__Fun.protect_326 <null> (main.exe:arm64+0x100099b94)
    #6 camlStdlib__Domain.body_741 <null> (main.exe:arm64+0x100071820)
    #7 caml_start_program <null> (main.exe:arm64+0x100106000)
    #8 caml_callback_exn callback.c:206 (main.exe:arm64+0x1000c0b64)
    #9 caml_callback_res callback.c:321 (main.exe:arm64+0x1000c1444)
    #10 domain_thread_func domain.c:1245 (main.exe:arm64+0x1000c4f88)

  Previous atomic write of size 8 at 0x000100e1b908 by thread T6 (mutexes: write M1):
    #0 caml_atomic_cas memory.c:388 (main.exe:arm64+0x1000e87ac)
    #1 camlSaturn__Ws_deque.steal_as_578 <null> (main.exe:arm64+0x100011860)
    #2 camlDomainslib__Multi_channel.recv_poll_loop_648 <null> (main.exe:arm64+0x100006e74)
    #3 camlDomainslib__Multi_channel.recv_poll_repeated_665 <null> (main.exe:arm64+0x100007684)
    #4 camlDomainslib__Multi_channel.recv_669 <null> (main.exe:arm64+0x10000791c)
    #5 camlDomainslib__Task.worker_628 <null> (main.exe:arm64+0x1000091f8)
    #6 camlStdlib__Fun.protect_326 <null> (main.exe:arm64+0x100099b94)
    #7 camlStdlib__Domain.body_741 <null> (main.exe:arm64+0x100071820)
    #8 caml_start_program <null> (main.exe:arm64+0x100106000)
    #9 caml_callback_exn callback.c:206 (main.exe:arm64+0x1000c0b64)
    #10 caml_callback_res callback.c:321 (main.exe:arm64+0x1000c1444)
    #11 domain_thread_func domain.c:1245 (main.exe:arm64+0x1000c4f88)

  Mutex M0 (0x000105600328) created at:
    #0 pthread_mutex_init <null> (libclang_rt.tsan_osx_dynamic.dylib:arm64e+0x34594)
    #1 caml_plat_mutex_init platform.c:59 (main.exe:arm64+0x1000f27bc)
    #2 caml_init_domains domain.c:974 (main.exe:arm64+0x1000c43fc)
    #3 caml_init_gc gc_ctrl.c:348 (main.exe:arm64+0x1000d4168)
    #4 caml_startup_common startup_nat.c:106 (main.exe:arm64+0x100104c98)
    #5 caml_main startup_nat.c:146 (main.exe:arm64+0x100104e68)
    #6 main main.c:37 (main.exe:arm64+0x1000e251c)
 [...]

SUMMARY: ThreadSanitizer: data race (main.exe:arm64+0x100011798) in camlSaturn__Ws_deque.steal_as_578+0x68
==================
fib(10) = 89
ThreadSanitizer: reported 1 warnings
ThreadSanitizer: reported 1 warnings

Technically, I can trigger this with a far simpler repro case:

(cli) ntaylor@smgrp ~/c/p/re_fun> cat bin/main.ml
module T = Domainslib.Task

(* Minimal repro: create a pool, run a trivial task that returns 42, print
   the result, and tear the pool down. *)
let () =
  let pool = T.setup_pool ~num_domains:4 () in
  let answer = T.run pool (fun () -> 42) in
  Printf.printf "%d\n" answer;
  T.teardown_pool pool
(cli) ntaylor@smgrp ~/c/p/re_fun>

This appears to be similar to ocaml-multicore/saturn#39, so perhaps this issue should be filed with the Saturn maintainers too; however, that Saturn issue was closed without a fix.

TSAN warnings that originate in libraries create friction when they surface in application code, even if they are de facto benign. Given how simple the repro case is, I imagine every domainslib program will report a race under TSAN. So, IMHO, a fix to squelch this warning (either by removing the race altogether or by adding some sort of TSAN annotation) should be reconsidered.

Thanks for your consideration,
Nathan

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions