@@ -23,50 +23,46 @@ struct devcd_entry {
2323 void * data ;
2424 size_t datalen ;
2525 /*
26- * Here, mutex is required to serialize the calls to del_wk work between
27- * user/kernel space which happens when devcd is added with device_add()
28- * and that sends uevent to user space. User space reads the uevents,
29- * and calls to devcd_data_write() which try to modify the work which is
30- * not even initialized/queued from devcoredump.
26+ * There are 2 races for which mutex is required.
3127 *
28+ * The first race is between device creation and userspace writing to
29+ * schedule immediately destruction.
3230 *
31+ * This race is handled by arming the timer before device creation, but
32+ * when device creation fails the timer still exists.
3333 *
34- * cpu0(X) cpu1(Y)
34+ * To solve this, hold the mutex during device_add(), and set
35+ * init_completed on success before releasing the mutex.
3536 *
36- * dev_coredump() uevent sent to user space
37- * device_add() ======================> user space process Y reads the
38- * uevents writes to devcd fd
39- * which results into writes to
 37+ * That way the timer will never fire until device_add() is called;
38+ * it will do nothing if init_completed is not set. The timer is also
39+ * cancelled in that case.
4040 *
41- * devcd_data_write()
42- * mod_delayed_work()
43- * try_to_grab_pending()
44- * del_timer()
45- * debug_assert_init()
46- * INIT_DELAYED_WORK()
47- * schedule_delayed_work()
48- *
49- *
50- * Also, mutex alone would not be enough to avoid scheduling of
51- * del_wk work after it get flush from a call to devcd_free()
52- * mentioned as below.
53- *
54- * disabled_store()
55- * devcd_free()
56- * mutex_lock() devcd_data_write()
57- * flush_delayed_work()
58- * mutex_unlock()
59- * mutex_lock()
60- * mod_delayed_work()
61- * mutex_unlock()
62- * So, delete_work flag is required.
 41+ * The second race involves multiple parallel invocations of devcd_free();
 42+ * a deleted flag is added so that only one of them calls the destructor.
6343 */
6444 struct mutex mutex ;
65- bool delete_work ;
45+ bool init_completed , deleted ;
6646 struct module * owner ;
6747 ssize_t (* read )(char * buffer , loff_t offset , size_t count ,
6848 void * data , size_t datalen );
6949 void (* free )(void * data );
50+ /*
 51+ * If nothing interferes and device_add() returns success,
52+ * del_wk will destroy the device after the timer fires.
53+ *
54+ * Multiple userspace processes can interfere in the working of the timer:
55+ * - Writing to the coredump will reschedule the timer to run immediately,
56+ * if still armed.
57+ *
58+ * This is handled by using "if (cancel_delayed_work()) {
59+ * schedule_delayed_work() }", to prevent re-arming after having
60+ * been previously fired.
61+ * - Writing to /sys/class/devcoredump/disabled will destroy the
62+ * coredump synchronously.
63+ * This is handled by using disable_delayed_work_sync(), and then
64+ * checking if deleted flag is set with &devcd->mutex held.
65+ */
7066 struct delayed_work del_wk ;
7167 struct device * failing_dev ;
7268};
@@ -95,14 +91,27 @@ static void devcd_dev_release(struct device *dev)
9591 kfree (devcd );
9692}
9793
94+ static void __devcd_del (struct devcd_entry * devcd )
95+ {
96+ devcd -> deleted = true;
97+ device_del (& devcd -> devcd_dev );
98+ put_device (& devcd -> devcd_dev );
99+ }
100+
98101static void devcd_del (struct work_struct * wk )
99102{
100103 struct devcd_entry * devcd ;
104+ bool init_completed ;
101105
102106 devcd = container_of (wk , struct devcd_entry , del_wk .work );
103107
104- device_del (& devcd -> devcd_dev );
105- put_device (& devcd -> devcd_dev );
108+ /* devcd->mutex serializes against dev_coredumpm_timeout */
109+ mutex_lock (& devcd -> mutex );
110+ init_completed = devcd -> init_completed ;
111+ mutex_unlock (& devcd -> mutex );
112+
113+ if (init_completed )
114+ __devcd_del (devcd );
106115}
107116
108117static ssize_t devcd_data_read (struct file * filp , struct kobject * kobj ,
@@ -122,12 +131,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
122131 struct device * dev = kobj_to_dev (kobj );
123132 struct devcd_entry * devcd = dev_to_devcd (dev );
124133
125- mutex_lock ( & devcd -> mutex );
126- if (! devcd -> delete_work ) {
127- devcd -> delete_work = true;
128- mod_delayed_work ( system_wq , & devcd -> del_wk , 0 );
129- }
130- mutex_unlock (& devcd -> mutex );
134+ /*
 135+ * Although it's tempting to use mod_delayed_work() here,
136+ * that will cause a reschedule if the timer already fired.
137+ */
138+ if ( cancel_delayed_work ( & devcd -> del_wk ))
139+ schedule_delayed_work (& devcd -> del_wk , 0 );
131140
132141 return count ;
133142}
@@ -155,11 +164,21 @@ static int devcd_free(struct device *dev, void *data)
155164{
156165 struct devcd_entry * devcd = dev_to_devcd (dev );
157166
167+ /*
168+ * To prevent a race with devcd_data_write(), disable work and
169+ * complete manually instead.
170+ *
171+ * We cannot rely on the return value of
172+ * disable_delayed_work_sync() here, because it might be in the
173+ * middle of a cancel_delayed_work + schedule_delayed_work pair.
174+ *
175+ * devcd->mutex here guards against multiple parallel invocations
176+ * of devcd_free().
177+ */
178+ disable_delayed_work_sync (& devcd -> del_wk );
158179 mutex_lock (& devcd -> mutex );
159- if (!devcd -> delete_work )
160- devcd -> delete_work = true;
161-
162- flush_delayed_work (& devcd -> del_wk );
180+ if (!devcd -> deleted )
181+ __devcd_del (devcd );
163182 mutex_unlock (& devcd -> mutex );
164183 return 0 ;
165184}
@@ -183,12 +202,10 @@ static ssize_t disabled_show(const struct class *class, const struct class_attri
183202 * put_device() <- last reference
184203 * error = fn(dev, data) devcd_dev_release()
185204 * devcd_free(dev, data) kfree(devcd)
186- * mutex_lock(&devcd->mutex);
187205 *
188206 *
189- * In the above diagram, It looks like disabled_store() would be racing with parallely
190- * running devcd_del() and result in memory abort while acquiring devcd->mutex which
191- * is called after kfree of devcd memory after dropping its last reference with
 207+ * In the above diagram, it looks like disabled_store() would be racing with a concurrently
 208+ * running devcd_del(), resulting in a memory abort after dropping its last reference with
192209 * put_device(). However, this will not happens as fn(dev, data) runs
193210 * with its own reference to device via klist_node so it is not its last reference.
194211 * so, above situation would not occur.
@@ -376,7 +393,7 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
376393 devcd -> read = read ;
377394 devcd -> free = free ;
378395 devcd -> failing_dev = get_device (dev );
379- devcd -> delete_work = false;
396+ devcd -> deleted = false;
380397
381398 mutex_init (& devcd -> mutex );
382399 device_initialize (& devcd -> devcd_dev );
@@ -385,8 +402,14 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
385402 atomic_inc_return (& devcd_count ));
386403 devcd -> devcd_dev .class = & devcd_class ;
387404
388- mutex_lock (& devcd -> mutex );
389405 dev_set_uevent_suppress (& devcd -> devcd_dev , true);
406+
407+ /* devcd->mutex prevents devcd_del() completing until init finishes */
408+ mutex_lock (& devcd -> mutex );
409+ devcd -> init_completed = false;
410+ INIT_DELAYED_WORK (& devcd -> del_wk , devcd_del );
411+ schedule_delayed_work (& devcd -> del_wk , timeout );
412+
390413 if (device_add (& devcd -> devcd_dev ))
391414 goto put_device ;
392415
@@ -403,13 +426,20 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
403426
404427 dev_set_uevent_suppress (& devcd -> devcd_dev , false);
405428 kobject_uevent (& devcd -> devcd_dev .kobj , KOBJ_ADD );
406- INIT_DELAYED_WORK (& devcd -> del_wk , devcd_del );
407- schedule_delayed_work (& devcd -> del_wk , timeout );
429+
430+ /*
431+ * Safe to run devcd_del() now that we are done with devcd_dev.
432+ * Alternatively we could have taken a ref on devcd_dev before
433+ * dropping the lock.
434+ */
435+ devcd -> init_completed = true;
408436 mutex_unlock (& devcd -> mutex );
409437 return ;
410438 put_device :
411- put_device (& devcd -> devcd_dev );
412439 mutex_unlock (& devcd -> mutex );
440+ cancel_delayed_work_sync (& devcd -> del_wk );
441+ put_device (& devcd -> devcd_dev );
442+
413443 put_module :
414444 module_put (owner );
415445 free :
0 commit comments