Skip to content

Commit 4853457

Browse files
author
Ralph Castain
committed
In an application process, the RML posted recvs are controlled by the async progress thread. However, the call to finalize and close the RML is made from the main thread, so we need to shift the actual destruction of the posted recv list to the async thread; otherwise, we encounter a race condition when accessing the posted recvs.
Thanks to Gilles for providing the required debug info.
1 parent ec3a383 commit 4853457

File tree

1 file changed

+28
-5
lines changed

1 file changed

+28
-5
lines changed

orte/mca/rml/base/rml_base_frame.c

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#include "orte/mca/rml/rml.h"
2828
#include "orte/mca/state/state.h"
29+
#include "orte/runtime/orte_wait.h"
2930
#include "orte/util/name_fns.h"
3031

3132
#include "orte/mca/rml/base/base.h"
@@ -74,14 +75,36 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
7475
return ORTE_SUCCESS;
7576
}
7677

77-
static int orte_rml_base_close(void)
78+
static void cleanup(int sd, short args, void *cbdata)
7879
{
79-
opal_list_item_t *item;
80+
bool *active = (bool*)cbdata;
8081

81-
while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
82-
OBJ_RELEASE(item);
82+
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
83+
if (NULL != active) {
84+
*active = false;
8385
}
84-
OBJ_DESTRUCT(&orte_rml_base.posted_recvs);
86+
}
87+
88+
static int orte_rml_base_close(void)
89+
{
90+
bool active;
91+
92+
/* because the RML posted recvs list is in a separate
93+
* async thread for apps, we can't just destruct it here.
94+
* Instead, we push it into that event thread and destruct
95+
* it there */
96+
if (ORTE_PROC_IS_APP) {
97+
opal_event_t ev;
98+
active = true;
99+
opal_event_set(orte_event_base, &ev, -1,
100+
OPAL_EV_WRITE, cleanup, &active);
101+
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
102+
opal_event_active(&ev, OPAL_EV_WRITE, 1);
103+
ORTE_WAIT_FOR_COMPLETION(active);
104+
} else {
105+
/* we can call the destruct directly */
106+
cleanup(0, 0, NULL);
107+
}
85108

86109
OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml);
87110
OBJ_DESTRUCT(&orte_rml_base.open_channels);

0 commit comments

Comments
 (0)