Skip to content

Commit e424d43

Browse files
committed
Potential fix
Signed-off-by: Michael Acar <michael.j.acar@gmail.com>
1 parent 1053dfc commit e424d43

File tree

1 file changed

+44
-20
lines changed

1 file changed

+44
-20
lines changed

src/ray/util/process.cc

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ class ProcessFD {
235235
pid = fork();
236236

237237
if (pid == 0) {
238-
// Child process (or intermediate process if decoupled).
238+
// --- Child Process (or Intermediate Process if decoupled) ---
239239
close(status_pipe[0]); // Child only writes to the status pipe.
240240
if (pipe_to_stdin) {
241241
close(parent_lifetime_pipe[1]); // Child only reads from the lifetime pipe.
@@ -245,22 +245,27 @@ class ProcessFD {
245245

246246
if (decouple) {
247247
if (fork() != 0) {
248-
// Intermediate parent exits. Grandchild is orphaned.
248+
// --- Intermediate Parent ---
249+
// This process must close ALL inherited pipe FDs before exiting
250+
// to prevent leaking them to the grandchild or holding pipes open.
251+
close(status_pipe[1]);
252+
if (pipe_to_stdin) {
253+
close(parent_lifetime_pipe[0]);
254+
}
249255
_exit(0);
250256
}
251257
}
252258

253-
// Grandchild (if decoupled) or direct child (if not) continues here.
259+
// --- Grandchild (if decoupled) or Direct Child (if not) ---
254260
if (pipe_to_stdin) {
255261
if (dup2(parent_lifetime_pipe[0], STDIN_FILENO) == -1) {
256-
// If dup2 fails, we can't health check, so exit.
257262
_exit(errno);
258263
}
264+
// After dup2, this original FD is no longer needed.
259265
close(parent_lifetime_pipe[0]);
260266
}
261267

262-
// Set FD_CLOEXEC on the status pipe's write end. If execve succeeds,
263-
// this FD will be closed, and the parent will read EOF.
268+
// If execve succeeds, this FD will be closed automatically.
264269
SetFdCloseOnExec(status_pipe[1]);
265270

266271
if (decouple) {
@@ -278,43 +283,62 @@ class ProcessFD {
278283
_exit(err);
279284

280285
} else if (pid > 0) {
281-
// Parent process.
286+
// --- Parent Process ---
282287
close(status_pipe[1]); // Parent only reads from the status pipe.
283288
if (pipe_to_stdin) {
284289
close(parent_lifetime_pipe[0]); // Parent only writes to the lifetime pipe.
285290
}
286291

287292
if (!decouple) {
293+
// Simple case for non-decoupled process
288294
int err_from_child;
289295
ssize_t bytes_read =
290296
ReadBytesFromFd(status_pipe[0], &err_from_child, sizeof(err_from_child));
291297
if (bytes_read == 0) {
292298
// Success: exec'd, pipe closed by CLOEXEC.
293299
ec = std::error_code();
294-
} else if (bytes_read == sizeof(err_from_child)) {
295-
ec = std::error_code(err_from_child, std::system_category());
296-
waitpid(pid, NULL, 0);
297-
pid = -1;
298300
} else {
299-
ec = std::error_code(bytes_read < 0 ? errno : EPIPE, std::system_category());
301+
// Failure: got an error or pipe broke.
302+
ec = std::error_code(bytes_read > 0 ? err_from_child : errno,
303+
std::system_category());
300304
waitpid(pid, NULL, 0);
301305
pid = -1;
302306
}
303307
close(status_pipe[0]);
304-
} else { // Decoupled
305-
waitpid(pid, NULL, 0); // Reap the intermediate child.
306-
ssize_t bytes_read = ReadBytesFromFd(status_pipe[0], &pid, sizeof(pid));
307-
if (bytes_read != sizeof(pid)) {
308-
// Failed to get grandchild's PID.
308+
} else {
309+
waitpid(pid, NULL, 0); // Reap intermediate child.
310+
311+
// Read the grandchild's PID from the pipe.
312+
ssize_t bytes_read_pid = ReadBytesFromFd(status_pipe[0], &pid, sizeof(pid));
313+
if (bytes_read_pid != sizeof(pid)) {
314+
// If we can't get the PID, it's a startup failure.
309315
ec = std::error_code(ECHILD, std::system_category());
310316
pid = -1;
317+
close(status_pipe[0]);
311318
} else {
312-
// Use the status pipe as the lifetime tracking fd.
313-
fd = status_pipe[0];
319+
// We got the PID. Now, do a NON-BLOCKING read to check for an exec error.
320+
int flags = fcntl(status_pipe[0], F_GETFL, 0);
321+
fcntl(status_pipe[0], F_SETFL, flags | O_NONBLOCK);
322+
int exec_errno = 0;
323+
ssize_t bytes_read_errno =
324+
read(status_pipe[0], &exec_errno, sizeof(exec_errno));
325+
fcntl(status_pipe[0], F_SETFL, flags); // Restore original flags.
326+
327+
if (bytes_read_errno == sizeof(exec_errno)) {
328+
// We got an error code back. Launch failed.
329+
ec = std::error_code(exec_errno, std::system_category());
330+
pid = -1;
331+
close(status_pipe[0]);
332+
} else {
333+
// No error code was present. Launch was successful.
334+
// Use the status pipe for lifetime tracking.
335+
ec = std::error_code();
336+
fd = status_pipe[0];
337+
}
314338
}
315339
}
316340
} else {
317-
// fork() failed.
341+
// --- Fork Failed ---
318342
ec = std::error_code(errno, std::system_category());
319343
close(status_pipe[0]);
320344
close(status_pipe[1]);

0 commit comments

Comments
 (0)