From 918a8adc0523e064d05af71850b8c5f9726a2cb1 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 10:50:45 -0600 Subject: [PATCH 01/57] winesync: Introduce the winesync driver and character device. Signed-off-by: Kai Krakow --- drivers/misc/Kconfig | 11 +++++++ drivers/misc/Makefile | 1 + drivers/misc/winesync.c | 64 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 drivers/misc/winesync.c diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index cadd4a820c0336..128d498750c355 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -562,6 +562,17 @@ config TPS6594_PFSM This driver can also be built as a module. If so, the module will be called tps6594-pfsm. +config WINESYNC + tristate "Synchronization primitives for Wine" + help + This module provides kernel support for synchronization primitives + used by Wine. It is not a hardware driver. + + To compile this driver as a module, choose M here: the + module will be called winesync. + + If unsure, say N. 
+ source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index f2a4d1ff65d46a..e7824ea71db493 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_PVPANIC) += pvpanic/ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o +obj-$(CONFIG_WINESYNC) += winesync.o obj-$(CONFIG_HI6421V600_IRQ) += hi6421v600-irq.o obj-$(CONFIG_OPEN_DICE) += open-dice.o obj-$(CONFIG_GP_PCI1XXXX) += mchp_pci1xxxx/ diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c new file mode 100644 index 00000000000000..111f33c5676e6c --- /dev/null +++ b/drivers/misc/winesync.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * winesync.c - Kernel driver for Wine synchronization primitives + * + * Copyright (C) 2021 Zebediah Figura + */ + +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/module.h> + +#define WINESYNC_NAME "winesync" + +static int winesync_char_open(struct inode *inode, struct file *file) +{ + return nonseekable_open(inode, file); +} + +static int winesync_char_release(struct inode *inode, struct file *file) +{ + return 0; +} + +static long winesync_char_ioctl(struct file *file, unsigned int cmd, + unsigned long parm) +{ + switch (cmd) { + default: + return -ENOSYS; + } +} + +static const struct file_operations winesync_fops = { + .owner = THIS_MODULE, + .open = winesync_char_open, + .release = winesync_char_release, + .unlocked_ioctl = winesync_char_ioctl, + .compat_ioctl = winesync_char_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice winesync_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = WINESYNC_NAME, + .fops = &winesync_fops, +}; + +static int __init winesync_init(void) +{ + return misc_register(&winesync_misc); +} + +static void __exit winesync_exit(void) +{ + misc_deregister(&winesync_misc); +} + +module_init(winesync_init);
+module_exit(winesync_exit); + +MODULE_AUTHOR("Zebediah Figura"); +MODULE_DESCRIPTION("Kernel driver for Wine synchronization primitives"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("devname:" WINESYNC_NAME); From 122ea73f4268ab395fd5cfc720c27815f78c68f5 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 10:57:06 -0600 Subject: [PATCH 02/57] winesync: Reserve a minor device number and ioctl range. Signed-off-by: Kai Krakow --- Documentation/admin-guide/devices.txt | 3 ++- Documentation/userspace-api/ioctl/ioctl-number.rst | 2 ++ drivers/misc/winesync.c | 3 ++- include/linux/miscdevice.h | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 8390549235304f..7ae111fd02e72c 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -376,8 +376,9 @@ 240 = /dev/userio Serio driver testing device 241 = /dev/vhost-vsock Host kernel driver for virtio vsock 242 = /dev/rfkill Turning off radio transmissions (rfkill) + 243 = /dev/winesync Wine synchronization primitive device - 243-254 Reserved for local use + 244-254 Reserved for local use 255 Reserved for MISC_DYNAMIC_MINOR 11 char Raw keyboard device (Linux/SPARC only) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 4ea5b837399ad1..825e95da7e95eb 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -375,6 +375,8 @@ Code Seq# Include File Comments 0xF6 all LTTng Linux Trace Toolkit Next Generation +0xF7 00-0F uapi/linux/winesync.h Wine synchronization primitives + 0xF8 all arch/x86/include/uapi/asm/amd_hsmp.h AMD HSMP EPYC system management interface driver 0xFD all linux/dm-ioctl.h diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 111f33c5676e6c..85cb6ccaa077db 100644 --- a/drivers/misc/winesync.c +++ 
b/drivers/misc/winesync.c @@ -40,7 +40,7 @@ static const struct file_operations winesync_fops = { }; static struct miscdevice winesync_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = WINESYNC_MINOR, .name = WINESYNC_NAME, .fops = &winesync_fops, }; @@ -62,3 +62,4 @@ MODULE_AUTHOR("Zebediah Figura"); MODULE_DESCRIPTION("Kernel driver for Wine synchronization primitives"); MODULE_LICENSE("GPL"); MODULE_ALIAS("devname:" WINESYNC_NAME); +MODULE_ALIAS_MISCDEV(WINESYNC_MINOR); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca507681..36fc5d5315a414 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -71,6 +71,7 @@ #define USERIO_MINOR 240 #define VHOST_VSOCK_MINOR 241 #define RFKILL_MINOR 242 +#define WINESYNC_MINOR 243 #define MISC_DYNAMIC_MINOR 255 struct device; From edb5a4bfcaa61b9d9fbcf6dc7315d0361013edf1 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:15:39 -0600 Subject: [PATCH 03/57] winesync: Introduce WINESYNC_IOC_CREATE_SEM and WINESYNC_IOC_DELETE. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 117 ++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 25 ++++++++ 2 files changed, 142 insertions(+) create mode 100644 include/uapi/linux/winesync.h diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 85cb6ccaa077db..36e31bbe039037 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -8,23 +8,140 @@ #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/module.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <uapi/linux/winesync.h> #define WINESYNC_NAME "winesync" +enum winesync_type { + WINESYNC_TYPE_SEM, +}; + +struct winesync_obj { + struct rcu_head rhead; + struct kref refcount; + + enum winesync_type type; + + union { + struct { + __u32 count; + __u32 max; + } sem; + } u; +}; + +struct winesync_device { + struct xarray objects; +}; + +static void destroy_obj(struct kref *ref) +{ + struct winesync_obj *obj = container_of(ref, struct winesync_obj, refcount); + + kfree_rcu(obj, rhead); +} + +static void put_obj(struct winesync_obj *obj) +{ + kref_put(&obj->refcount, destroy_obj); +} + static int winesync_char_open(struct inode *inode, struct file *file) { + struct winesync_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + xa_init_flags(&dev->objects, XA_FLAGS_ALLOC); + + file->private_data = dev; return nonseekable_open(inode, file); } static int winesync_char_release(struct inode *inode, struct file *file) { + struct winesync_device *dev = file->private_data; + struct winesync_obj *obj; + unsigned long id; + + xa_for_each(&dev->objects, id, obj) + put_obj(obj); + + xa_destroy(&dev->objects); + + kfree(dev); + + return 0; +} + +static void init_obj(struct winesync_obj *obj) +{ + kref_init(&obj->refcount); +} + +static int winesync_create_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return
-EFAULT; + + if (args.count > args.max) + return -EINVAL; + + sem = kzalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return -ENOMEM; + + init_obj(sem); + sem->type = WINESYNC_TYPE_SEM; + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + + ret = xa_alloc(&dev->objects, &id, sem, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + kfree(sem); + return ret; + } + + return put_user(id, &user_args->sem); +} + +static int winesync_delete(struct winesync_device *dev, void __user *argp) +{ + struct winesync_obj *obj; + __u32 id; + + if (get_user(id, (__u32 __user *)argp)) + return -EFAULT; + + obj = xa_erase(&dev->objects, id); + if (!obj) + return -EINVAL; + + put_obj(obj); return 0; } static long winesync_char_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { + struct winesync_device *dev = file->private_data; + void __user *argp = (void __user *)parm; + switch (cmd) { + case WINESYNC_IOC_CREATE_SEM: + return winesync_create_sem(dev, argp); + case WINESYNC_IOC_DELETE: + return winesync_delete(dev, argp); default: return -ENOSYS; } diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h new file mode 100644 index 00000000000000..aabb491f39d2dc --- /dev/null +++ b/include/uapi/linux/winesync.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Kernel support for Wine synchronization primitives + * + * Copyright (C) 2021 Zebediah Figura + */ + +#ifndef __LINUX_WINESYNC_H +#define __LINUX_WINESYNC_H + +#include <linux/types.h> + +struct winesync_sem_args { + __u32 sem; + __u32 count; + __u32 max; +}; + +#define WINESYNC_IOC_BASE 0xf7 + +#define WINESYNC_IOC_CREATE_SEM _IOWR(WINESYNC_IOC_BASE, 0, \ + struct winesync_sem_args) +#define WINESYNC_IOC_DELETE _IOW (WINESYNC_IOC_BASE, 1, __u32) + +#endif From ef2cef8fee82a8caa342b5cbf3c27293910f5747 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:22:42 -0600 Subject: [PATCH 04/57] winesync: Introduce WINESYNC_IOC_PUT_SEM.
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 76 +++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 + 2 files changed, 78 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 36e31bbe039037..84b5a5c9e0ce72 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -21,9 +21,11 @@ enum winesync_type { struct winesync_obj { struct rcu_head rhead; struct kref refcount; + spinlock_t lock; enum winesync_type type; + /* The following fields are protected by the object lock. */ union { struct { __u32 count; @@ -36,6 +38,19 @@ struct winesync_device { struct xarray objects; }; +static struct winesync_obj *get_obj(struct winesync_device *dev, __u32 id) +{ + struct winesync_obj *obj; + + rcu_read_lock(); + obj = xa_load(&dev->objects, id); + if (obj && !kref_get_unless_zero(&obj->refcount)) + obj = NULL; + rcu_read_unlock(); + + return obj; +} + static void destroy_obj(struct kref *ref) { struct winesync_obj *obj = container_of(ref, struct winesync_obj, refcount); @@ -48,6 +63,18 @@ static void put_obj(struct winesync_obj *obj) kref_put(&obj->refcount, destroy_obj); } +static struct winesync_obj *get_obj_typed(struct winesync_device *dev, __u32 id, + enum winesync_type type) +{ + struct winesync_obj *obj = get_obj(dev, id); + + if (obj && obj->type != type) { + put_obj(obj); + return NULL; + } + return obj; +} + static int winesync_char_open(struct inode *inode, struct file *file) { struct winesync_device *dev; @@ -81,6 +108,7 @@ static int winesync_char_release(struct inode *inode, struct file *file) static void init_obj(struct winesync_obj *obj) { kref_init(&obj->refcount); + spin_lock_init(&obj->lock); } static int winesync_create_sem(struct winesync_device *dev, void __user *argp) @@ -131,6 +159,52 @@ static int winesync_delete(struct winesync_device *dev, void __user *argp) return 0; } +/* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+ */ +static int put_sem_state(struct winesync_obj *sem, __u32 count) +{ + lockdep_assert_held(&sem->lock); + + if (sem->u.sem.count + count < sem->u.sem.count || + sem->u.sem.count + count > sem->u.sem.max) + return -EOVERFLOW; + + sem->u.sem.count += count; + return 0; +} + +static int winesync_put_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 prev_count; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + sem = get_obj_typed(dev, args.sem, WINESYNC_TYPE_SEM); + if (!sem) + return -EINVAL; + + spin_lock(&sem->lock); + + prev_count = sem->u.sem.count; + ret = put_sem_state(sem, args.count); + + spin_unlock(&sem->lock); + + put_obj(sem); + + if (!ret && put_user(prev_count, &user_args->count)) + ret = -EFAULT; + + return ret; +} + static long winesync_char_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { @@ -142,6 +216,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_create_sem(dev, argp); case WINESYNC_IOC_DELETE: return winesync_delete(dev, argp); + case WINESYNC_IOC_PUT_SEM: + return winesync_put_sem(dev, argp); default: return -ENOSYS; } diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index aabb491f39d2dc..7681a168eb92ec 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -21,5 +21,7 @@ struct winesync_sem_args { #define WINESYNC_IOC_CREATE_SEM _IOWR(WINESYNC_IOC_BASE, 0, \ struct winesync_sem_args) #define WINESYNC_IOC_DELETE _IOW (WINESYNC_IOC_BASE, 1, __u32) +#define WINESYNC_IOC_PUT_SEM _IOWR(WINESYNC_IOC_BASE, 2, \ + struct winesync_sem_args) #endif From 363432b43aebacb81fa83ee6846b939d62e122d1 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:31:44 -0600 Subject: [PATCH 05/57] winesync: Introduce WINESYNC_IOC_WAIT_ANY. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 226 ++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 11 ++ 2 files changed, 237 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 84b5a5c9e0ce72..d9b5ab159520df 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -23,6 +23,8 @@ struct winesync_obj { struct kref refcount; spinlock_t lock; + struct list_head any_waiters; + enum winesync_type type; /* The following fields are protected by the object lock. */ @@ -34,6 +36,28 @@ struct winesync_obj { } u; }; +struct winesync_q_entry { + struct list_head node; + struct winesync_q *q; + struct winesync_obj *obj; + __u32 index; +}; + +struct winesync_q { + struct task_struct *task; + __u32 owner; + + /* + * Protected via atomic_cmpxchg(). Only the thread that wins the + * compare-and-swap may actually change object states and wake this + * task. + */ + atomic_t signaled; + + __u32 count; + struct winesync_q_entry entries[]; +}; + struct winesync_device { struct xarray objects; }; @@ -109,6 +133,26 @@ static void init_obj(struct winesync_obj *obj) { kref_init(&obj->refcount); spin_lock_init(&obj->lock); + INIT_LIST_HEAD(&obj->any_waiters); +} + +static void try_wake_any_sem(struct winesync_obj *sem) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&sem->lock); + + list_for_each_entry(entry, &sem->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (!sem->u.sem.count) + break; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + sem->u.sem.count--; + wake_up_process(q->task); + } + } } static int winesync_create_sem(struct winesync_device *dev, void __user *argp) @@ -194,6 +238,8 @@ static int winesync_put_sem(struct winesync_device *dev, void __user *argp) prev_count = sem->u.sem.count; ret = put_sem_state(sem, args.count); + if (!ret) + try_wake_any_sem(sem); spin_unlock(&sem->lock); @@ -205,6 +251,184 @@ static int winesync_put_sem(struct winesync_device *dev, 
void __user *argp) return ret; } +static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) +{ + int ret = 0; + + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&q->signaled) != -1) { + ret = 0; + break; + } + ret = schedule_hrtimeout(timeout, HRTIMER_MODE_ABS); + } while (ret < 0); + __set_current_state(TASK_RUNNING); + + return ret; +} + +/* + * Allocate and initialize the winesync_q structure, but do not queue us yet. + * Also, calculate the relative timeout. + */ +static int setup_wait(struct winesync_device *dev, + const struct winesync_wait_args *args, + ktime_t *ret_timeout, struct winesync_q **ret_q) +{ + const __u32 count = args->count; + struct winesync_q *q; + ktime_t timeout = 0; + __u32 *ids; + __u32 i, j; + + if (!args->owner || args->pad) + return -EINVAL; + + if (args->timeout) { + struct timespec64 to; + + if (get_timespec64(&to, u64_to_user_ptr(args->timeout))) + return -EFAULT; + if (!timespec64_valid(&to)) + return -EINVAL; + + timeout = timespec64_to_ns(&to); + } + + ids = kmalloc_array(count, sizeof(*ids), GFP_KERNEL); + if (!ids) + return -ENOMEM; + if (copy_from_user(ids, u64_to_user_ptr(args->objs), + array_size(count, sizeof(*ids)))) { + kfree(ids); + return -EFAULT; + } + + q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); + if (!q) { + kfree(ids); + return -ENOMEM; + } + q->task = current; + q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->count = count; + + for (i = 0; i < count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = get_obj(dev, ids[i]); + + if (!obj) + goto err; + + entry->obj = obj; + entry->q = q; + entry->index = i; + } + + kfree(ids); + + *ret_q = q; + *ret_timeout = timeout; + return 0; + +err: + for (j = 0; j < i; j++) + put_obj(q->entries[j].obj); + kfree(ids); + kfree(q); + return -EINVAL; +} + +static void try_wake_any_obj(struct winesync_obj *obj) +{ + 
switch (obj->type) { + case WINESYNC_TYPE_SEM: + try_wake_any_sem(obj); + break; + } +} + +static int winesync_wait_any(struct winesync_device *dev, void __user *argp) +{ + struct winesync_wait_args args; + struct winesync_q *q; + ktime_t timeout; + int signaled; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, &timeout, &q); + if (ret < 0) + return ret; + + /* queue ourselves */ + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_add_tail(&entry->node, &obj->any_waiters); + spin_unlock(&obj->lock); + } + + /* check if we are already signaled */ + + for (i = 0; i < args.count; i++) { + struct winesync_obj *obj = q->entries[i].obj; + + if (atomic_read(&q->signaled) != -1) + break; + + spin_lock(&obj->lock); + try_wake_any_obj(obj); + spin_unlock(&obj->lock); + } + + /* sleep */ + + ret = winesync_schedule(q, args.timeout ? 
&timeout : NULL); + + /* and finally, unqueue */ + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_del(&entry->node); + spin_unlock(&obj->lock); + + put_obj(obj); + } + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct winesync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + static long winesync_char_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { @@ -218,6 +442,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_delete(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); + case WINESYNC_IOC_WAIT_ANY: + return winesync_wait_any(dev, argp); default: return -ENOSYS; } diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 7681a168eb92ec..f57ebfbe1dd928 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -16,6 +16,15 @@ struct winesync_sem_args { __u32 max; }; +struct winesync_wait_args { + __u64 timeout; + __u64 objs; + __u32 count; + __u32 owner; + __u32 index; + __u32 pad; +}; + #define WINESYNC_IOC_BASE 0xf7 #define WINESYNC_IOC_CREATE_SEM _IOWR(WINESYNC_IOC_BASE, 0, \ @@ -23,5 +32,7 @@ struct winesync_sem_args { #define WINESYNC_IOC_DELETE _IOW (WINESYNC_IOC_BASE, 1, __u32) #define WINESYNC_IOC_PUT_SEM _IOWR(WINESYNC_IOC_BASE, 2, \ struct winesync_sem_args) +#define WINESYNC_IOC_WAIT_ANY _IOWR(WINESYNC_IOC_BASE, 3, \ + struct winesync_wait_args) #endif From e616fd44b5adf65565dc8e883a17ffa7069ef63d Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:36:09 -0600 Subject: [PATCH 06/57] winesync: Introduce WINESYNC_IOC_WAIT_ALL. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 242 ++++++++++++++++++++++++++++++++-- include/uapi/linux/winesync.h | 2 + 2 files changed, 236 insertions(+), 8 deletions(-) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index d9b5ab159520df..2b708c5b88a678 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -23,7 +23,34 @@ struct winesync_obj { struct kref refcount; spinlock_t lock; + /* + * any_waiters is protected by the object lock, but all_waiters is + * protected by the device wait_all_lock. + */ struct list_head any_waiters; + struct list_head all_waiters; + + /* + * Hint describing how many tasks are queued on this object in a + * wait-all operation. + * + * Any time we do a wake, we may need to wake "all" waiters as well as + * "any" waiters. In order to atomically wake "all" waiters, we must + * lock all of the objects, and that means grabbing the wait_all_lock + * below (and, due to lock ordering rules, before locking this object). + * However, wait-all is a rare operation, and grabbing the wait-all + * lock for every wake would create unnecessary contention. Therefore we + * first check whether all_hint is zero, and, if it is, we skip trying + * to wake "all" waiters. + * + * This hint isn't protected by any lock. It might change during the + * course of a wake, but there's no meaningful race there; it's only a + * hint. + * + * Since wait requests must originate from user-space threads, we're + * limited here by PID_MAX_LIMIT, so there's no risk of saturation. + */ + atomic_t all_hint; enum winesync_type type; @@ -54,11 +81,25 @@ struct winesync_q { */ atomic_t signaled; + bool all; __u32 count; struct winesync_q_entry entries[]; }; struct winesync_device { + /* + * Wait-all operations must atomically grab all objects, and be totally + * ordered with respect to each other and wait-any operations. If one + * thread is trying to acquire several objects, another thread cannot + * touch the object at the same time. 
+ * + * We achieve this by grabbing multiple object locks at the same time. + * However, this creates a lock ordering problem. To solve that problem, + * wait_all_lock is taken first whenever multiple objects must be locked + * at the same time. + */ + spinlock_t wait_all_lock; + struct xarray objects; }; @@ -107,6 +148,8 @@ static int winesync_char_open(struct inode *inode, struct file *file) if (!dev) return -ENOMEM; + spin_lock_init(&dev->wait_all_lock); + xa_init_flags(&dev->objects, XA_FLAGS_ALLOC); file->private_data = dev; @@ -132,8 +175,82 @@ static int winesync_char_release(struct inode *inode, struct file *file) static void init_obj(struct winesync_obj *obj) { kref_init(&obj->refcount); + atomic_set(&obj->all_hint, 0); spin_lock_init(&obj->lock); INIT_LIST_HEAD(&obj->any_waiters); + INIT_LIST_HEAD(&obj->all_waiters); +} + +static bool is_signaled(struct winesync_obj *obj, __u32 owner) +{ + lockdep_assert_held(&obj->lock); + + switch (obj->type) { + case WINESYNC_TYPE_SEM: + return !!obj->u.sem.count; + } + + WARN(1, "bad object type %#x\n", obj->type); + return false; +} + +/* + * "locked_obj" is an optional pointer to an object which is already locked and + * should not be locked again. This is necessary so that changing an object's + * state and waking it can be a single atomic operation. 
+ */ +static void try_wake_all(struct winesync_device *dev, struct winesync_q *q, + struct winesync_obj *locked_obj) +{ + __u32 count = q->count; + bool can_wake = true; + __u32 i; + + lockdep_assert_held(&dev->wait_all_lock); + if (locked_obj) + lockdep_assert_held(&locked_obj->lock); + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + spin_lock(&q->entries[i].obj->lock); + } + + for (i = 0; i < count; i++) { + if (!is_signaled(q->entries[i].obj, q->owner)) { + can_wake = false; + break; + } + } + + if (can_wake && atomic_cmpxchg(&q->signaled, -1, 0) == -1) { + for (i = 0; i < count; i++) { + struct winesync_obj *obj = q->entries[i].obj; + + switch (obj->type) { + case WINESYNC_TYPE_SEM: + obj->u.sem.count--; + break; + } + } + wake_up_process(q->task); + } + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + spin_unlock(&q->entries[i].obj->lock); + } +} + +static void try_wake_all_obj(struct winesync_device *dev, + struct winesync_obj *obj) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&dev->wait_all_lock); + lockdep_assert_held(&obj->lock); + + list_for_each_entry(entry, &obj->all_waiters, node) + try_wake_all(dev, entry->q, obj); } static void try_wake_any_sem(struct winesync_obj *sem) @@ -234,14 +351,29 @@ static int winesync_put_sem(struct winesync_device *dev, void __user *argp) if (!sem) return -EINVAL; - spin_lock(&sem->lock); + if (atomic_read(&sem->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&sem->lock); + + prev_count = sem->u.sem.count; + ret = put_sem_state(sem, args.count); + if (!ret) { + try_wake_all_obj(dev, sem); + try_wake_any_sem(sem); + } - prev_count = sem->u.sem.count; - ret = put_sem_state(sem, args.count); - if (!ret) - try_wake_any_sem(sem); + spin_unlock(&sem->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&sem->lock); - spin_unlock(&sem->lock); + prev_count = sem->u.sem.count; + ret = put_sem_state(sem, args.count); + if (!ret) + 
try_wake_any_sem(sem); + + spin_unlock(&sem->lock); + } put_obj(sem); @@ -278,7 +410,7 @@ static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) * Also, calculate the relative timeout. */ static int setup_wait(struct winesync_device *dev, - const struct winesync_wait_args *args, + const struct winesync_wait_args *args, bool all, ktime_t *ret_timeout, struct winesync_q **ret_q) { const __u32 count = args->count; @@ -318,6 +450,7 @@ static int setup_wait(struct winesync_device *dev, q->task = current; q->owner = args->owner; atomic_set(&q->signaled, -1); + q->all = all; q->count = count; for (i = 0; i < count; i++) { @@ -327,6 +460,16 @@ static int setup_wait(struct winesync_device *dev, if (!obj) goto err; + if (all) { + /* Check that the objects are all distinct. */ + for (j = 0; j < i; j++) { + if (obj == q->entries[j].obj) { + put_obj(obj); + goto err; + } + } + } + entry->obj = obj; entry->q = q; entry->index = i; @@ -367,7 +510,7 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - ret = setup_wait(dev, &args, &timeout, &q); + ret = setup_wait(dev, &args, false, &timeout, &q); if (ret < 0) return ret; @@ -429,6 +572,87 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) return ret; } +static int winesync_wait_all(struct winesync_device *dev, void __user *argp) +{ + struct winesync_wait_args args; + struct winesync_q *q; + ktime_t timeout; + int signaled; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, true, &timeout, &q); + if (ret < 0) + return ret; + + /* queue ourselves */ + + spin_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + atomic_inc(&obj->all_hint); + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than 
obj->lock, so there is no need to acquire it here. + */ + list_add_tail(&entry->node, &obj->all_waiters); + } + + /* check if we are already signaled */ + + try_wake_all(dev, q, NULL); + + spin_unlock(&dev->wait_all_lock); + + /* sleep */ + + ret = winesync_schedule(q, args.timeout ? &timeout : NULL); + + /* and finally, unqueue */ + + spin_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct winesync_q_entry *entry = &q->entries[i]; + struct winesync_obj *obj = entry->obj; + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than obj->lock, so there is no need to acquire it here. + */ + list_del(&entry->node); + + atomic_dec(&obj->all_hint); + + put_obj(obj); + } + + spin_unlock(&dev->wait_all_lock); + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct winesync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + static long winesync_char_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { @@ -442,6 +666,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_delete(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); + case WINESYNC_IOC_WAIT_ALL: + return winesync_wait_all(dev, argp); case WINESYNC_IOC_WAIT_ANY: return winesync_wait_any(dev, argp); default: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index f57ebfbe1dd928..44025a510cb99b 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -34,5 +34,7 @@ struct winesync_wait_args { struct winesync_sem_args) #define WINESYNC_IOC_WAIT_ANY _IOWR(WINESYNC_IOC_BASE, 3, \ struct winesync_wait_args) +#define WINESYNC_IOC_WAIT_ALL _IOWR(WINESYNC_IOC_BASE, 4, \ + struct winesync_wait_args) #endif From 
e63f7313655d898773a17442522d00c84ec5d78d Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:41:10 -0600 Subject: [PATCH 07/57] winesync: Introduce WINESYNC_IOC_CREATE_MUTEX. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 72 +++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 8 ++++ 2 files changed, 80 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 2b708c5b88a678..18eb0597590732 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -16,6 +16,7 @@ enum winesync_type { WINESYNC_TYPE_SEM, + WINESYNC_TYPE_MUTEX, }; struct winesync_obj { @@ -60,6 +61,10 @@ struct winesync_obj { __u32 count; __u32 max; } sem; + struct { + __u32 count; + __u32 owner; + } mutex; } u; }; @@ -188,6 +193,10 @@ static bool is_signaled(struct winesync_obj *obj, __u32 owner) switch (obj->type) { case WINESYNC_TYPE_SEM: return !!obj->u.sem.count; + case WINESYNC_TYPE_MUTEX: + if (obj->u.mutex.owner && obj->u.mutex.owner != owner) + return false; + return obj->u.mutex.count < UINT_MAX; } WARN(1, "bad object type %#x\n", obj->type); @@ -230,6 +239,10 @@ static void try_wake_all(struct winesync_device *dev, struct winesync_q *q, case WINESYNC_TYPE_SEM: obj->u.sem.count--; break; + case WINESYNC_TYPE_MUTEX: + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; } } wake_up_process(q->task); @@ -272,6 +285,28 @@ static void try_wake_any_sem(struct winesync_obj *sem) } } +static void try_wake_any_mutex(struct winesync_obj *mutex) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&mutex->lock); + + list_for_each_entry(entry, &mutex->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (mutex->u.mutex.count == UINT_MAX) + break; + if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) + continue; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + mutex->u.mutex.count++; + mutex->u.mutex.owner = q->owner; + wake_up_process(q->task); + } + } +} + static int 
winesync_create_sem(struct winesync_device *dev, void __user *argp) { struct winesync_sem_args __user *user_args = argp; @@ -304,6 +339,38 @@ static int winesync_create_sem(struct winesync_device *dev, void __user *argp) return put_user(id, &user_args->sem); } +static int winesync_create_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + if (!args.owner != !args.count) + return -EINVAL; + + mutex = kzalloc(sizeof(*mutex), GFP_KERNEL); + if (!mutex) + return -ENOMEM; + + init_obj(mutex); + mutex->type = WINESYNC_TYPE_MUTEX; + mutex->u.mutex.count = args.count; + mutex->u.mutex.owner = args.owner; + + ret = xa_alloc(&dev->objects, &id, mutex, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + kfree(mutex); + return ret; + } + + return put_user(id, &user_args->mutex); +} + static int winesync_delete(struct winesync_device *dev, void __user *argp) { struct winesync_obj *obj; @@ -495,6 +562,9 @@ static void try_wake_any_obj(struct winesync_obj *obj) case WINESYNC_TYPE_SEM: try_wake_any_sem(obj); break; + case WINESYNC_TYPE_MUTEX: + try_wake_any_mutex(obj); + break; } } @@ -660,6 +730,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { + case WINESYNC_IOC_CREATE_MUTEX: + return winesync_create_mutex(dev, argp); case WINESYNC_IOC_CREATE_SEM: return winesync_create_sem(dev, argp); case WINESYNC_IOC_DELETE: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 44025a510cb99b..23606a3b1546ae 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -16,6 +16,12 @@ struct winesync_sem_args { __u32 max; }; +struct winesync_mutex_args { + __u32 mutex; + __u32 owner; + __u32 count; +}; + struct winesync_wait_args { __u64 timeout; __u64 
objs; @@ -36,5 +42,7 @@ struct winesync_wait_args { struct winesync_wait_args) #define WINESYNC_IOC_WAIT_ALL _IOWR(WINESYNC_IOC_BASE, 4, \ struct winesync_wait_args) +#define WINESYNC_IOC_CREATE_MUTEX _IOWR(WINESYNC_IOC_BASE, 5, \ + struct winesync_mutex_args) #endif From 54ffc1e526fac9435a137c40aea6adba9bdf05c3 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:44:41 -0600 Subject: [PATCH 08/57] winesync: Introduce WINESYNC_IOC_PUT_MUTEX. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 67 +++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 69 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 18eb0597590732..d18d08a6854663 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -450,6 +450,71 @@ static int winesync_put_sem(struct winesync_device *dev, void __user *argp) return ret; } +/* + * Actually change the mutex state, returning -EPERM if not the owner. + */ +static int put_mutex_state(struct winesync_obj *mutex, + const struct winesync_mutex_args *args) +{ + lockdep_assert_held(&mutex->lock); + + if (mutex->u.mutex.owner != args->owner) + return -EPERM; + + if (!--mutex->u.mutex.count) + mutex->u.mutex.owner = 0; + return 0; +} + +static int winesync_put_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 prev_count; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (!args.owner) + return -EINVAL; + + mutex = get_obj_typed(dev, args.mutex, WINESYNC_TYPE_MUTEX); + if (!mutex) + return -EINVAL; + + if (atomic_read(&mutex->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&mutex->lock); + + prev_count = mutex->u.mutex.count; + ret = put_mutex_state(mutex, &args); + if (!ret) { + try_wake_all_obj(dev, mutex); + try_wake_any_mutex(mutex); + } + + spin_unlock(&mutex->lock); + 
spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&mutex->lock); + + prev_count = mutex->u.mutex.count; + ret = put_mutex_state(mutex, &args); + if (!ret) + try_wake_any_mutex(mutex); + + spin_unlock(&mutex->lock); + } + + put_obj(mutex); + + if (!ret && put_user(prev_count, &user_args->count)) + ret = -EFAULT; + + return ret; +} + static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) { int ret = 0; @@ -736,6 +801,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_create_sem(dev, argp); case WINESYNC_IOC_DELETE: return winesync_delete(dev, argp); + case WINESYNC_IOC_PUT_MUTEX: + return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); case WINESYNC_IOC_WAIT_ALL: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 23606a3b1546ae..fde08cb8ab959d 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -44,5 +44,7 @@ struct winesync_wait_args { struct winesync_wait_args) #define WINESYNC_IOC_CREATE_MUTEX _IOWR(WINESYNC_IOC_BASE, 5, \ struct winesync_mutex_args) +#define WINESYNC_IOC_PUT_MUTEX _IOWR(WINESYNC_IOC_BASE, 6, \ + struct winesync_mutex_args) #endif From bfc193d3ab42f36c8bef5fa5811e3e1c227fe19b Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:46:46 -0600 Subject: [PATCH 09/57] winesync: Introduce WINESYNC_IOC_KILL_OWNER. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 80 ++++++++++++++++++++++++++++++++++- include/uapi/linux/winesync.h | 1 + 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index d18d08a6854663..891537063bb658 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -64,6 +64,7 @@ struct winesync_obj { struct { __u32 count; __u32 owner; + bool ownerdead; } mutex; } u; }; @@ -87,6 +88,7 @@ struct winesync_q { atomic_t signaled; bool all; + bool ownerdead; __u32 count; struct winesync_q_entry entries[]; }; @@ -240,6 +242,9 @@ static void try_wake_all(struct winesync_device *dev, struct winesync_q *q, obj->u.sem.count--; break; case WINESYNC_TYPE_MUTEX: + if (obj->u.mutex.ownerdead) + q->ownerdead = true; + obj->u.mutex.ownerdead = false; obj->u.mutex.count++; obj->u.mutex.owner = q->owner; break; @@ -300,6 +305,9 @@ static void try_wake_any_mutex(struct winesync_obj *mutex) continue; if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + if (mutex->u.mutex.ownerdead) + q->ownerdead = true; + mutex->u.mutex.ownerdead = false; mutex->u.mutex.count++; mutex->u.mutex.owner = q->owner; wake_up_process(q->task); @@ -515,6 +523,71 @@ static int winesync_put_mutex(struct winesync_device *dev, void __user *argp) return ret; } +/* + * Actually change the mutex state to mark its owner as dead. 
+ */ +static void put_mutex_ownerdead_state(struct winesync_obj *mutex) +{ + lockdep_assert_held(&mutex->lock); + + mutex->u.mutex.ownerdead = true; + mutex->u.mutex.owner = 0; + mutex->u.mutex.count = 0; +} + +static int winesync_kill_owner(struct winesync_device *dev, void __user *argp) +{ + struct winesync_obj *obj; + unsigned long id; + __u32 owner; + + if (get_user(owner, (__u32 __user *)argp)) + return -EFAULT; + if (!owner) + return -EINVAL; + + rcu_read_lock(); + + xa_for_each(&dev->objects, id, obj) { + if (!kref_get_unless_zero(&obj->refcount)) + continue; + + if (obj->type != WINESYNC_TYPE_MUTEX) { + put_obj(obj); + continue; + } + + if (atomic_read(&obj->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&obj->lock); + + if (obj->u.mutex.owner == owner) { + put_mutex_ownerdead_state(obj); + try_wake_all_obj(dev, obj); + try_wake_any_mutex(obj); + } + + spin_unlock(&obj->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&obj->lock); + + if (obj->u.mutex.owner == owner) { + put_mutex_ownerdead_state(obj); + try_wake_any_mutex(obj); + } + + spin_unlock(&obj->lock); + } + + put_obj(obj); + } + + rcu_read_unlock(); + + return 0; +} + static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) { int ret = 0; @@ -583,6 +656,7 @@ static int setup_wait(struct winesync_device *dev, q->owner = args->owner; atomic_set(&q->signaled, -1); q->all = all; + q->ownerdead = false; q->count = count; for (i = 0; i < count; i++) { @@ -695,7 +769,7 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) struct winesync_wait_args __user *user_args = argp; /* even if we caught a signal, we need to communicate success */ - ret = 0; + ret = q->ownerdead ? 
-EOWNERDEAD : 0; if (put_user(signaled, &user_args->index)) ret = -EFAULT; @@ -776,7 +850,7 @@ static int winesync_wait_all(struct winesync_device *dev, void __user *argp) struct winesync_wait_args __user *user_args = argp; /* even if we caught a signal, we need to communicate success */ - ret = 0; + ret = q->ownerdead ? -EOWNERDEAD : 0; if (put_user(signaled, &user_args->index)) ret = -EFAULT; @@ -801,6 +875,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_create_sem(dev, argp); case WINESYNC_IOC_DELETE: return winesync_delete(dev, argp); + case WINESYNC_IOC_KILL_OWNER: + return winesync_kill_owner(dev, argp); case WINESYNC_IOC_PUT_MUTEX: return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index fde08cb8ab959d..f57aa76d57f54d 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -46,5 +46,6 @@ struct winesync_wait_args { struct winesync_mutex_args) #define WINESYNC_IOC_PUT_MUTEX _IOWR(WINESYNC_IOC_BASE, 6, \ struct winesync_mutex_args) +#define WINESYNC_IOC_KILL_OWNER _IOW (WINESYNC_IOC_BASE, 7, __u32) #endif From 545dc8c27d8170c92e8df40e304938696ff19dad Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:47:55 -0600 Subject: [PATCH 10/57] winesync: Introduce WINESYNC_IOC_READ_SEM. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 29 +++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 31 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 891537063bb658..98bedda2f8eb9c 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -523,6 +523,33 @@ static int winesync_put_mutex(struct winesync_device *dev, void __user *argp) return ret; } +static int winesync_read_sem(struct winesync_device *dev, void __user *argp) +{ + struct winesync_sem_args __user *user_args = argp; + struct winesync_sem_args args; + struct winesync_obj *sem; + __u32 id; + + if (get_user(id, &user_args->sem)) + return -EFAULT; + + sem = get_obj_typed(dev, id, WINESYNC_TYPE_SEM); + if (!sem) + return -EINVAL; + + args.sem = id; + spin_lock(&sem->lock); + args.count = sem->u.sem.count; + args.max = sem->u.sem.max; + spin_unlock(&sem->lock); + + put_obj(sem); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return 0; +} + /* * Actually change the mutex state to mark its owner as dead. 
*/ @@ -881,6 +908,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); + case WINESYNC_IOC_READ_SEM: + return winesync_read_sem(dev, argp); case WINESYNC_IOC_WAIT_ALL: return winesync_wait_all(dev, argp); case WINESYNC_IOC_WAIT_ANY: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index f57aa76d57f54d..311eb810647d22 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -47,5 +47,7 @@ struct winesync_wait_args { #define WINESYNC_IOC_PUT_MUTEX _IOWR(WINESYNC_IOC_BASE, 6, \ struct winesync_mutex_args) #define WINESYNC_IOC_KILL_OWNER _IOW (WINESYNC_IOC_BASE, 7, __u32) +#define WINESYNC_IOC_READ_SEM _IOWR(WINESYNC_IOC_BASE, 8, \ + struct winesync_sem_args) #endif From 90a5f7e7f3a18cd3b76208ed911c9bb83c8c387a Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:48:10 -0600 Subject: [PATCH 11/57] winesync: Introduce WINESYNC_IOC_READ_MUTEX. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 98bedda2f8eb9c..eae272663abe7c 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -550,6 +550,35 @@ static int winesync_read_sem(struct winesync_device *dev, void __user *argp) return 0; } +static int winesync_read_mutex(struct winesync_device *dev, void __user *argp) +{ + struct winesync_mutex_args __user *user_args = argp; + struct winesync_mutex_args args; + struct winesync_obj *mutex; + __u32 id; + int ret; + + if (get_user(id, &user_args->mutex)) + return -EFAULT; + + mutex = get_obj_typed(dev, id, WINESYNC_TYPE_MUTEX); + if (!mutex) + return -EINVAL; + + args.mutex = id; + spin_lock(&mutex->lock); + args.count = mutex->u.mutex.count; + args.owner = mutex->u.mutex.owner; + ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0; + spin_unlock(&mutex->lock); + + put_obj(mutex); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return ret; +} + /* * Actually change the mutex state to mark its owner as dead. 
*/ @@ -908,6 +937,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); + case WINESYNC_IOC_READ_MUTEX: + return winesync_read_mutex(dev, argp); case WINESYNC_IOC_READ_SEM: return winesync_read_sem(dev, argp); case WINESYNC_IOC_WAIT_ALL: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 311eb810647d22..3371a303a9270a 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -49,5 +49,7 @@ struct winesync_wait_args { #define WINESYNC_IOC_KILL_OWNER _IOW (WINESYNC_IOC_BASE, 7, __u32) #define WINESYNC_IOC_READ_SEM _IOWR(WINESYNC_IOC_BASE, 8, \ struct winesync_sem_args) +#define WINESYNC_IOC_READ_MUTEX _IOWR(WINESYNC_IOC_BASE, 9, \ + struct winesync_mutex_args) #endif From 095281e35cdd27278775832a691d529419070fba Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 11:50:49 -0600 Subject: [PATCH 12/57] docs: winesync: Add documentation for the winesync uAPI. Signed-off-by: Kai Krakow --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/winesync.rst | 324 +++++++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 Documentation/userspace-api/winesync.rst diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 72a65db0c49889..ce5d0df572ebd1 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -32,6 +32,7 @@ place where this information is gathered. sysfs-platform_profile vduse futex2 + winesync .. 
only:: subproject and html diff --git a/Documentation/userspace-api/winesync.rst b/Documentation/userspace-api/winesync.rst new file mode 100644 index 00000000000000..34e54be229cfc2 --- /dev/null +++ b/Documentation/userspace-api/winesync.rst @@ -0,0 +1,324 @@ +===================================== +Wine synchronization primitive driver +===================================== + +This page documents the user-space API for the winesync driver. + +winesync is a support driver for emulation of NT synchronization +primitives by the Wine project or other NT emulators. It exists +because implementation in user-space, using existing tools, cannot +simultaneously satisfy performance, correctness, and security +constraints. It is implemented entirely in software, and does not +drive any hardware device. + +This interface is meant as a compatibility tool only, and should not +be used for general synchronization. Instead use generic, versatile +interfaces such as futex(2) and poll(2). + +Synchronization primitives +========================== + +The winesync driver exposes two types of synchronization primitives, +semaphores and mutexes. + +A semaphore holds a single volatile 32-bit counter, and a static +32-bit integer denoting the maximum value. It is considered signaled +when the counter is nonzero. The counter is decremented by one when a +wait is satisfied. Both the initial and maximum count are established +when the semaphore is created. + +A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit +identifier denoting its owner. A mutex is considered signaled when its +owner is zero (indicating that it is not owned). The recursion count +is incremented when a wait is satisfied, and ownership is set to the +given identifier. + +A mutex also holds an internal flag denoting whether its previous +owner has died; such a mutex is said to be inconsistent. 
Owner death +is not tracked automatically based on thread death, but rather must be +communicated using ``WINESYNC_IOC_KILL_OWNER``. An inconsistent mutex +is inherently considered unowned. + +Except for the "unowned" semantics of zero, the actual value of the +owner identifier is not interpreted by the winesync driver at all. The +intended use is to store a thread identifier; however, the winesync +driver does not actually validate that a calling thread provides +consistent or unique identifiers. + +Unless specified otherwise, all operations on an object are atomic and +totally ordered with respect to other operations on the same object. + +Objects are represented by unsigned 32-bit integers. + +Char device +=========== + +The winesync driver creates a single char device /dev/winesync. Each +file description opened on the device represents a unique namespace. +That is, objects created on one open file description are shared +across all its individual descriptors, but are not shared with other +open() calls on the same device. The same file description may be +shared across multiple processes. + +ioctl reference +=============== + +All operations on the device are done through ioctls. There are three +structures used in ioctl calls:: + + struct winesync_sem_args { + __u32 sem; + __u32 count; + __u32 max; + }; + + struct winesync_mutex_args { + __u32 mutex; + __u32 owner; + __u32 count; + }; + + struct winesync_wait_args { + __u64 timeout; + __u64 objs; + __u32 count; + __u32 owner; + __u32 index; + __u32 pad; + }; + +Depending on the ioctl, members of the structure may be used as input, +output, or not at all. All ioctls return 0 on success. + +The ioctls are as follows: + +.. c:macro:: WINESYNC_IOC_CREATE_SEM + + Create a semaphore object. Takes a pointer to struct + :c:type:`winesync_sem_args`, which is used as follows: + + .. list-table:: + + * - ``sem`` + - On output, contains the identifier of the created semaphore. 
+ * - ``count`` + - Initial count of the semaphore. + * - ``max`` + - Maximum count of the semaphore. + + Fails with ``EINVAL`` if ``count`` is greater than ``max``. + +.. c:macro:: WINESYNC_IOC_CREATE_MUTEX + + Create a mutex object. Takes a pointer to struct + :c:type:`winesync_mutex_args`, which is used as follows: + + .. list-table:: + + * - ``mutex`` + - On output, contains the identifier of the created mutex. + * - ``count`` + - Initial recursion count of the mutex. + * - ``owner`` + - Initial owner of the mutex. + + If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is + zero and ``count`` is nonzero, the function fails with ``EINVAL``. + +.. c:macro:: WINESYNC_IOC_DELETE + + Delete an object of any type. Takes an input-only pointer to a + 32-bit integer denoting the object to delete. + + Wait ioctls currently in progress are not interrupted, and behave as + if the object remains valid. + +.. c:macro:: WINESYNC_IOC_PUT_SEM + + Post to a semaphore object. Takes a pointer to struct + :c:type:`winesync_sem_args`, which is used as follows: + + .. list-table:: + + * - ``sem`` + - Semaphore object to post to. + * - ``count`` + - Count to add to the semaphore. On output, contains the + previous count of the semaphore. + * - ``max`` + - Not used. + + If adding ``count`` to the semaphore's current count would raise the + latter past the semaphore's maximum count, the ioctl fails with + ``EOVERFLOW`` and the semaphore is not affected. If raising the + semaphore's count causes it to become signaled, eligible threads + waiting on this semaphore will be woken and the semaphore's count + decremented appropriately. + +.. c:macro:: WINESYNC_IOC_PUT_MUTEX + + Release a mutex object. Takes a pointer to struct + :c:type:`winesync_mutex_args`, which is used as follows: + + .. list-table:: + + * - ``mutex`` + - Mutex object to release. + * - ``owner`` + - Mutex owner identifier. + * - ``count`` + - On output, contains the previous recursion count. 
+ + If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` + is not the current owner of the mutex, the ioctl fails with + ``EPERM``. + + The mutex's count will be decremented by one. If decrementing the + mutex's count causes it to become zero, the mutex is marked as + unowned and signaled, and eligible threads waiting on it will be + woken as appropriate. + +.. c:macro:: WINESYNC_IOC_READ_SEM + + Read the current state of a semaphore object. Takes a pointer to + struct :c:type:`winesync_sem_args`, which is used as follows: + + .. list-table:: + + * - ``sem`` + - Semaphore object to read. + * - ``count`` + - On output, contains the current count of the semaphore. + * - ``max`` + - On output, contains the maximum count of the semaphore. + +.. c:macro:: WINESYNC_IOC_READ_MUTEX + + Read the current state of a mutex object. Takes a pointer to struct + :c:type:`winesync_mutex_args`, which is used as follows: + + .. list-table:: + + * - ``mutex`` + - Mutex object to read. + * - ``owner`` + - On output, contains the current owner of the mutex, or zero + if the mutex is not currently owned. + * - ``count`` + - On output, contains the current recursion count of the mutex. + + If the mutex is marked as inconsistent, the function fails with + ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to + zero. + +.. c:macro:: WINESYNC_IOC_KILL_OWNER + + Mark any mutexes owned by the given owner as unowned and + inconsistent. Takes an input-only pointer to a 32-bit integer + denoting the owner. If the owner is zero, the ioctl fails with + ``EINVAL``. + + For each mutex currently owned by the given owner, eligible threads + waiting on said mutex will be woken as appropriate (and such waits + will fail with ``EOWNERDEAD``, as described below). + + The operation as a whole is not atomic; however, the modification of + each mutex is atomic and totally ordered with respect to other + operations on the same mutex. + +.. 
c:macro:: WINESYNC_IOC_WAIT_ANY + + Poll on any of a list of objects, atomically acquiring at most one. + Takes a pointer to struct :c:type:`winesync_wait_args`, which is + used as follows: + + .. list-table:: + + * - ``timeout`` + - Optional pointer to a 64-bit struct :c:type:`timespec` + (specified as an integer so that the structure has the same + size regardless of architecture). The timeout is specified in + absolute format, as measured against the MONOTONIC clock. If + the timeout is equal to or earlier than the current time, the + function returns immediately without sleeping. If ``timeout`` + is zero, i.e. NULL, the function will sleep until an object + is signaled, and will not fail with ``ETIMEDOUT``. + * - ``objs`` + - Pointer to an array of ``count`` 32-bit object identifiers + (specified as an integer so that the structure has the same + size regardless of architecture). If any identifier is + invalid, the function fails with ``EINVAL``. + * - ``count`` + - Number of object identifiers specified in the ``objs`` array. + * - ``owner`` + - Mutex owner identifier. If any object in ``objs`` is a mutex, + the ioctl will attempt to acquire that mutex on behalf of + ``owner``. If ``owner`` is zero, the ioctl fails with + ``EINVAL``. + * - ``index`` + - On success, contains the index (into ``objs``) of the object + which was signaled. + * - ``pad`` + - This field is not used and must be set to zero. + + This function attempts to acquire one of the given objects. If + unable to do so, it sleeps until an object becomes signaled, + subsequently acquiring it, or the timeout expires. In the latter + case the ioctl fails with ``ETIMEDOUT``. The function only acquires + one object, even if multiple objects are signaled. + + A semaphore is considered to be signaled if its count is nonzero, + and is acquired by decrementing its count by one. 
A mutex is + considered to be signaled if it is unowned or if its owner matches + the ``owner`` argument, and is acquired by incrementing its + recursion count by one and setting its owner to the ``owner`` + argument. + + Acquisition is atomic and totally ordered with respect to other + operations on the same object. If two wait operations (with + different ``owner`` identifiers) are queued on the same mutex, only + one is signaled. If two wait operations are queued on the same + semaphore, and a value of one is posted to it, only one is signaled. + The order in which threads are signaled is not specified. + + If an inconsistent mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Although this is a failure return, the function may + otherwise be considered successful. The mutex is marked as owned by + the given owner (with a recursion count of 1) and as no longer + inconsistent, and ``index`` is still set to the index of the mutex. + + It is valid to pass the same object more than once. If a wakeup + occurs due to that object being signaled, ``index`` is set to the + lowest index corresponding to that object. + + The function may fail with ``EINTR`` if a signal is received. + +.. c:macro:: WINESYNC_IOC_WAIT_ALL + + Poll on a list of objects, atomically acquiring all of them. Takes a + pointer to struct :c:type:`winesync_wait_args`, which is used + identically to ``WINESYNC_IOC_WAIT_ANY``, except that ``index`` is + always filled with zero on success. + + This function attempts to simultaneously acquire all of the given + objects. If unable to do so, it sleeps until all objects become + simultaneously signaled, subsequently acquiring them, or the timeout + expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and + no objects are modified. + + Objects may become signaled and subsequently designaled (through + acquisition by other threads) while this thread is sleeping. 
Only + once all objects are simultaneously signaled does the ioctl acquire + them and return. The entire acquisition is atomic and totally + ordered with respect to other operations on any of the given + objects. + + If an inconsistent mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Similarly to ``WINESYNC_IOC_WAIT_ANY``, all objects + are nevertheless marked as acquired. Note that if multiple mutex + objects are specified, there is no way to know which were marked as + inconsistent. + + Unlike ``WINESYNC_IOC_WAIT_ANY``, it is not valid to pass the same + object more than once. If this is attempted, the function fails with + ``EINVAL``. From 2a262c5185d08ce10e3e6d9c21974855f6f2f692 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:06:23 -0600 Subject: [PATCH 13/57] selftests: winesync: Add some tests for semaphore state. Signed-off-by: Kai Krakow --- tools/testing/selftests/Makefile | 1 + .../selftests/drivers/winesync/Makefile | 8 + .../testing/selftests/drivers/winesync/config | 1 + .../selftests/drivers/winesync/winesync.c | 153 ++++++++++++++++++ 4 files changed, 163 insertions(+) create mode 100644 tools/testing/selftests/drivers/winesync/Makefile create mode 100644 tools/testing/selftests/drivers/winesync/config create mode 100644 tools/testing/selftests/drivers/winesync/winesync.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 697f13bbbc3217..c594eb398fdf3b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -18,6 +18,7 @@ TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net/bonding TARGETS += drivers/net/team +TARGETS += drivers/winesync TARGETS += efivarfs TARGETS += exec TARGETS += fchmodat2 diff --git a/tools/testing/selftests/drivers/winesync/Makefile b/tools/testing/selftests/drivers/winesync/Makefile new file mode 100644 index 00000000000000..43b39fdeea10ec --- /dev/null +++ 
b/tools/testing/selftests/drivers/winesync/Makefile @@ -0,0 +1,8 @@ +# SPDX-LICENSE-IDENTIFIER: GPL-2.0-only +TEST_GEN_PROGS := winesync + +top_srcdir =../../../../.. +CFLAGS += -I$(top_srcdir)/usr/include +LDLIBS += -lpthread + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/winesync/config b/tools/testing/selftests/drivers/winesync/config new file mode 100644 index 00000000000000..60539c826d0624 --- /dev/null +++ b/tools/testing/selftests/drivers/winesync/config @@ -0,0 +1 @@ +CONFIG_WINESYNC=y diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c new file mode 100644 index 00000000000000..58ade297fef90d --- /dev/null +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Various unit tests for the "winesync" synchronization primitive driver. + * + * Copyright (C) 2021 Zebediah Figura + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +static int read_sem_state(int fd, __u32 sem, __u32 *count, __u32 *max) +{ + struct winesync_sem_args args; + int ret; + + args.sem = sem; + args.count = 0xdeadbeef; + args.max = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &args); + *count = args.count; + *max = args.max; + return ret; +} + +#define check_sem_state(fd, sem, count, max) \ + ({ \ + __u32 __count, __max; \ + int ret = read_sem_state((fd), (sem), &__count, &__max); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((max), __max); \ + }) + +static int put_sem(int fd, __u32 sem, __u32 *count) +{ + struct winesync_sem_args args; + int ret; + + args.sem = sem; + args.count = *count; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &args); + *count = args.count; + return ret; +} + +static int wait_any(int fd, __u32 count, const __u32 *objs, __u32 owner, + __u32 *index) +{ + struct winesync_wait_args args = {0}; + struct 
timespec timeout; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + args.timeout = (uintptr_t)&timeout; + args.count = count; + args.objs = (uintptr_t)objs; + args.owner = owner; + args.index = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &args); + *index = args.index; + return ret; +} + +TEST(semaphore_state) +{ + struct winesync_sem_args sem_args; + struct timespec timeout; + __u32 sem, count, index; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 3; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + sem_args.count = 2; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + check_sem_state(fd, sem, 2, 2); + + count = 0; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_sem_state(fd, sem, 2, 2); + + count = 1; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(fd, sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem, 1, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem, 0, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + count = 3; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(fd, sem, 0, 2); + + count = 2; + ret = put_sem(fd, sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(fd, sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + + count = 1; + ret = put_sem(fd, 
sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(fd, sem, 1, 2); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST_HARNESS_MAIN From 483bf8927a188176fa5f65bc574ffccbdf3ee460 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:07:04 -0600 Subject: [PATCH 14/57] selftests: winesync: Add some tests for mutex state. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 58ade297fef90d..801b776da5aa4f 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -49,6 +49,42 @@ static int put_sem(int fd, __u32 sem, __u32 *count) return ret; } +static int read_mutex_state(int fd, __u32 mutex, __u32 *count, __u32 *owner) +{ + struct winesync_mutex_args args; + int ret; + + args.mutex = mutex; + args.count = 0xdeadbeef; + args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &args); + *count = args.count; + *owner = args.owner; + return ret; +} + +#define check_mutex_state(fd, mutex, count, owner) \ + ({ \ + __u32 __count, __owner; \ + int ret = read_mutex_state((fd), (mutex), &__count, &__owner); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((owner), __owner); \ + }) + +static int put_mutex(int fd, __u32 mutex, __u32 owner, __u32 *count) +{ + struct winesync_mutex_args args; + int ret; + + args.mutex = mutex; + args.owner = owner; + args.count = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &args); + *count = args.count; + return ret; +} + static int wait_any(int fd, __u32 count, const __u32 *objs, __u32 owner, __u32 *index) { @@ -150,4 +186,156 @@ TEST(semaphore_state) close(fd); } +TEST(mutex_state) +{ + struct winesync_mutex_args mutex_args; + __u32 mutex, owner, count, 
index; + struct timespec timeout; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + mutex_args.owner = 123; + mutex_args.count = 0; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 0; + mutex_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 123; + mutex_args.count = 2; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + mutex = mutex_args.mutex; + check_mutex_state(fd, mutex, 2, 123); + + ret = put_mutex(fd, mutex, 0, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = put_mutex(fd, mutex, 456, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + check_mutex_state(fd, mutex, 2, 123); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(fd, mutex, 1, 123); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + check_mutex_state(fd, mutex, 0, 0); + + ret = put_mutex(fd, mutex, 123, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + + ret = wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 456); + + ret = wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 2, 456); + + ret = put_mutex(fd, mutex, 456, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(fd, mutex, 1, 456); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 0; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + owner = 123; + ret = ioctl(fd, 
WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + check_mutex_state(fd, mutex, 1, 456); + + owner = 456; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + mutex_args.count = 0xdeadbeef; + mutex_args.owner = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex); + EXPECT_EQ(0, ret); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + mutex = mutex_args.mutex; + check_mutex_state(fd, mutex, 0, 0); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(fd, mutex, 1, 123); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST_HARNESS_MAIN From a99c4279ba56679b14005e0cd9391df9b6e9a232 Mon Sep 17 
00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:07:45 -0600 Subject: [PATCH 15/57] selftests: winesync: Add some tests for WINESYNC_IOC_WAIT_ANY. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 801b776da5aa4f..5903061d38b6b5 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -338,4 +338,111 @@ TEST(mutex_state) close(fd); } +TEST(test_wait_any) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], owner, index; + struct timespec timeout; + int fd, ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + sem_args.count = 1; + 
ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(1, index); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + /* test waiting on the same object twice */ + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + objs[0] = objs[1] = sem_args.sem; + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); /* the index wait_any() reported */ + check_sem_state(fd, sem_args.sem, 1, 3); + + ret = wait_any(fd, 0, NULL, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST_HARNESS_MAIN From 3cf8acf6ed726ccd779757e70320d0d10688bf46 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:08:25 -0600 Subject: [PATCH 16/57] selftests: winesync: Add some tests for WINESYNC_IOC_WAIT_ALL. 
Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 104 +++++++++++++++++- 1 file changed, 101 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 5903061d38b6b5..0718219f54bf34 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -85,8 +85,8 @@ static int put_mutex(int fd, __u32 mutex, __u32 owner, __u32 *count) return ret; } -static int wait_any(int fd, __u32 count, const __u32 *objs, __u32 owner, - __u32 *index) +static int wait_objs(int fd, unsigned long request, __u32 count, + const __u32 *objs, __u32 owner, __u32 *index) { struct winesync_wait_args args = {0}; struct timespec timeout; @@ -99,11 +99,23 @@ static int wait_any(int fd, __u32 count, const __u32 *objs, __u32 owner, args.objs = (uintptr_t)objs; args.owner = owner; args.index = 0xdeadbeef; - ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &args); + ret = ioctl(fd, request, &args); *index = args.index; return ret; } +static int wait_any(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, count, objs, owner, index); +} + +static int wait_all(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, count, objs, owner, index); +} + TEST(semaphore_state) { struct winesync_sem_args sem_args; @@ -445,4 +457,90 @@ TEST(test_wait_any) close(fd); } +TEST(test_wait_all) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], owner, index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 0; + 
mutex_args.count = 0; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_all(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(fd, sem_args.sem, 0, 3); + check_mutex_state(fd, mutex_args.mutex, 2, 123); + + sem_args.count = 3; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 2, 3); + check_mutex_state(fd, mutex_args.mutex, 3, 123); + + owner = 123; + ret = ioctl(fd, WINESYNC_IOC_KILL_OWNER, &owner); + EXPECT_EQ(0, ret); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 123); + + /* test waiting on the same object twice */ + objs[0] = objs[1] = sem_args.sem; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST_HARNESS_MAIN From 
2757d45110d5ceb1a55c11b2ee213fa4481d25ae Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:08:54 -0600 Subject: [PATCH 17/57] selftests: winesync: Add some tests for invalid object handling. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 0718219f54bf34..8a9fb496f5e04d 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -543,4 +543,97 @@ TEST(test_wait_all) close(fd); } +TEST(invalid_objects) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2] = {0}; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + wait_args.objs = (uintptr_t)objs; + wait_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + sem_args.max = 1; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + + mutex_args.mutex = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_PUT_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = 
ioctl(fd, WINESYNC_IOC_READ_MUTEX, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + objs[0] = sem_args.sem; + objs[1] = sem_args.sem + 1; + wait_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + objs[0] = sem_args.sem + 1; + objs[1] = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_WAIT_ALL, &wait_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + + sem_args.sem = mutex_args.mutex; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_SEM, &sem_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST_HARNESS_MAIN From 75fb5a9b7a0881ca5d78c900e83c6e9ba36f1da3 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:09:32 -0600 Subject: [PATCH 18/57] selftests: winesync: Add some tests for wakeup signaling with WINESYNC_IOC_WAIT_ANY. 
Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 8a9fb496f5e04d..04855df0089402 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -636,4 +636,158 @@ TEST(invalid_objects) close(fd); } +struct wake_args +{ + int fd; + __u32 obj; +}; + +struct wait_args +{ + int fd; + unsigned long request; + struct winesync_wait_args *args; + int ret; + int err; +}; + +static void *wait_thread(void *arg) +{ + struct wait_args *args = arg; + + args->ret = ioctl(args->fd, args->request, args->args); + args->err = errno; + return NULL; +} + +static void get_abs_timeout(struct timespec *timeout, clockid_t clock, + unsigned int ms) +{ + clock_gettime(clock, timeout); + timeout->tv_nsec += ms * 1000000; + timeout->tv_sec += (timeout->tv_nsec / 1000000000); + timeout->tv_nsec %= 1000000000; +} + +static int wait_for_thread(pthread_t thread, unsigned int ms) +{ + struct timespec timeout; + get_abs_timeout(&timeout, CLOCK_REALTIME, ms); + return pthread_timedjoin_np(thread, NULL, &timeout); +} + +TEST(wake_any) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 objs[2], count, index; + struct timespec timeout; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 123; + mutex_args.count = 1; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, 
mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + /* test waking the semaphore */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 456; + wait_args.index = 0xdeadbeef; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + check_sem_state(fd, sem_args.sem, 0, 3); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(0, wait_args.index); + + /* test waking the mutex */ + + /* first grab it again for owner 123 */ + ret = wait_any(fd, 1, &mutex_args.mutex, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.owner = 456; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); /* count written back by put_mutex() */ + check_mutex_state(fd, mutex_args.mutex, 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + /* delete an object while it's being waited on */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 200); + wait_args.owner = 123; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = 
wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + TEST_HARNESS_MAIN From 383198a9a271179ae2e96c47d4cc08da3a04919c Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:09:36 -0600 Subject: [PATCH 19/57] selftests: winesync: Add some tests for wakeup signaling with WINESYNC_IOC_WAIT_ALL. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 04855df0089402..ad6d0f9a2a35b5 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -790,4 +790,106 @@ TEST(wake_any) close(fd); } +TEST(wake_all) +{ + struct winesync_mutex_args mutex_args = {0}; + struct winesync_wait_args wait_args = {0}; + struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 objs[2], count, index; + struct timespec timeout; + pthread_t thread; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + + mutex_args.owner = 123; + mutex_args.count = 1; + mutex_args.mutex = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, mutex_args.mutex); + + objs[0] = sem_args.sem; + objs[1] = mutex_args.mutex; + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + 
wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 456; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_sem_state(fd, sem_args.sem, 1, 3); + + ret = wait_any(fd, 1, &sem_args.sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = put_mutex(fd, mutex_args.mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_mutex_state(fd, mutex_args.mutex, 0, 0); + + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, sem_args.count); + check_sem_state(fd, sem_args.sem, 1, 3); + check_mutex_state(fd, mutex_args.mutex, 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + + /* delete an object while it's being waited on */ + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 200); + wait_args.owner = 123; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &sem_args.sem); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + TEST_HARNESS_MAIN From 4305105571e233f29b8216d2671959010ab3617d Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Fri, 5 Mar 2021 12:22:55 -0600 Subject: [PATCH 20/57] 
maintainers: Add an entry for winesync. Signed-off-by: Kai Krakow --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f09415b2b3c5cf..418ef9f6563b0e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23240,6 +23240,15 @@ M: David Härdeman S: Maintained F: drivers/media/rc/winbond-cir.c +WINESYNC SYNCHRONIZATION PRIMITIVE DRIVER +M: Zebediah Figura +L: wine-devel@winehq.org +S: Supported +F: Documentation/userspace-api/winesync.rst +F: drivers/misc/winesync.c +F: include/uapi/linux/winesync.h +F: tools/testing/selftests/drivers/winesync/ + WINSYSTEMS EBC-C384 WATCHDOG DRIVER L: linux-watchdog@vger.kernel.org S: Orphan From 20510e3801af3fdafc22f4e615ab44c9b6be9342 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 18:21:03 -0600 Subject: [PATCH 21/57] winesync: Introduce WINESYNC_IOC_CREATE_EVENT. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 65 +++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 8 +++++ 2 files changed, 73 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index eae272663abe7c..eaba41510784ec 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -17,6 +17,7 @@ enum winesync_type { WINESYNC_TYPE_SEM, WINESYNC_TYPE_MUTEX, + WINESYNC_TYPE_EVENT, }; struct winesync_obj { @@ -66,6 +67,10 @@ struct winesync_obj { __u32 owner; bool ownerdead; } mutex; + struct { + bool manual; + bool signaled; + } event; } u; }; @@ -199,6 +204,8 @@ static bool is_signaled(struct winesync_obj *obj, __u32 owner) if (obj->u.mutex.owner && obj->u.mutex.owner != owner) return false; return obj->u.mutex.count < UINT_MAX; + case WINESYNC_TYPE_EVENT: + return obj->u.event.signaled; } WARN(1, "bad object type %#x\n", obj->type); @@ -248,6 +255,10 @@ static void try_wake_all(struct winesync_device *dev, struct winesync_q *q, obj->u.mutex.count++; obj->u.mutex.owner = q->owner; break; + case WINESYNC_TYPE_EVENT: + if (!obj->u.event.manual) + 
obj->u.event.signaled = false; + break; } } wake_up_process(q->task); @@ -315,6 +326,26 @@ static void try_wake_any_mutex(struct winesync_obj *mutex) } } +static void try_wake_any_event(struct winesync_obj *event) +{ + struct winesync_q_entry *entry; + + lockdep_assert_held(&event->lock); + + list_for_each_entry(entry, &event->any_waiters, node) { + struct winesync_q *q = entry->q; + + if (!event->u.event.signaled) + break; + + if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) { + if (!event->u.event.manual) + event->u.event.signaled = false; + wake_up_process(q->task); + } + } +} + static int winesync_create_sem(struct winesync_device *dev, void __user *argp) { struct winesync_sem_args __user *user_args = argp; @@ -379,6 +410,35 @@ static int winesync_create_mutex(struct winesync_device *dev, void __user *argp) return put_user(id, &user_args->mutex); } +static int winesync_create_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + __u32 id; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + init_obj(event); + event->type = WINESYNC_TYPE_EVENT; + event->u.event.manual = args.manual; + event->u.event.signaled = args.signaled; + + ret = xa_alloc(&dev->objects, &id, event, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + kfree(event); + return ret; + } + + return put_user(id, &user_args->event); +} + static int winesync_delete(struct winesync_device *dev, void __user *argp) { struct winesync_obj *obj; @@ -760,6 +820,9 @@ static void try_wake_any_obj(struct winesync_obj *obj) case WINESYNC_TYPE_MUTEX: try_wake_any_mutex(obj); break; + case WINESYNC_TYPE_EVENT: + try_wake_any_event(obj); + break; } } @@ -925,6 +988,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; 
switch (cmd) { + case WINESYNC_IOC_CREATE_EVENT: + return winesync_create_event(dev, argp); case WINESYNC_IOC_CREATE_MUTEX: return winesync_create_mutex(dev, argp); case WINESYNC_IOC_CREATE_SEM: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 3371a303a9270a..3999407534e099 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -22,6 +22,12 @@ struct winesync_mutex_args { __u32 count; }; +struct winesync_event_args { + __u32 event; + __u32 manual; + __u32 signaled; +}; + struct winesync_wait_args { __u64 timeout; __u64 objs; @@ -51,5 +57,7 @@ struct winesync_wait_args { struct winesync_sem_args) #define WINESYNC_IOC_READ_MUTEX _IOWR(WINESYNC_IOC_BASE, 9, \ struct winesync_mutex_args) +#define WINESYNC_IOC_CREATE_EVENT _IOWR(WINESYNC_IOC_BASE, 10, \ + struct winesync_event_args) #endif From d75493d7ec027330a34ec797792ccd09d3ce34d3 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 18:43:30 -0600 Subject: [PATCH 22/57] winesync: Introduce WINESYNC_IOC_SET_EVENT. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 45 +++++++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index eaba41510784ec..658ad7b80c291b 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -704,6 +704,49 @@ static int winesync_kill_owner(struct winesync_device *dev, void __user *argp) return 0; } +static int winesync_set_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + bool prev_state; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = get_obj_typed(dev, args.event, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + if (atomic_read(&event->all_hint) > 0) { + spin_lock(&dev->wait_all_lock); + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = true; + try_wake_all_obj(dev, event); + try_wake_any_event(event); + + spin_unlock(&event->lock); + spin_unlock(&dev->wait_all_lock); + } else { + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = true; + try_wake_any_event(event); + + spin_unlock(&event->lock); + } + + put_obj(event); + + if (put_user(prev_state, &user_args->signaled)) + return -EFAULT; + + return 0; +} + static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) { int ret = 0; @@ -1006,6 +1049,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_read_mutex(dev, argp); case WINESYNC_IOC_READ_SEM: return winesync_read_sem(dev, argp); + case WINESYNC_IOC_SET_EVENT: + return winesync_set_event(dev, argp); case WINESYNC_IOC_WAIT_ALL: return winesync_wait_all(dev, argp); case WINESYNC_IOC_WAIT_ANY: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 3999407534e099..34cd65d879a847 100644 
--- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -59,5 +59,7 @@ struct winesync_wait_args { struct winesync_mutex_args) #define WINESYNC_IOC_CREATE_EVENT _IOWR(WINESYNC_IOC_BASE, 10, \ struct winesync_event_args) +#define WINESYNC_IOC_SET_EVENT _IOWR(WINESYNC_IOC_BASE, 11, \ + struct winesync_event_args) #endif From 0c1be22f43d45e851ac5dc7359ac99ae7804295b Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 19:00:25 -0600 Subject: [PATCH 23/57] winesync: Introduce WINESYNC_IOC_RESET_EVENT. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 658ad7b80c291b..a93f173127f44c 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -747,6 +747,35 @@ static int winesync_set_event(struct winesync_device *dev, void __user *argp) return 0; } +static int winesync_reset_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + bool prev_state; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = get_obj_typed(dev, args.event, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + spin_lock(&event->lock); + + prev_state = event->u.event.signaled; + event->u.event.signaled = false; + + spin_unlock(&event->lock); + + put_obj(event); + + if (put_user(prev_state, &user_args->signaled)) + return -EFAULT; + + return 0; +} + static int winesync_schedule(const struct winesync_q *q, ktime_t *timeout) { int ret = 0; @@ -1049,6 +1078,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_read_mutex(dev, argp); case WINESYNC_IOC_READ_SEM: return winesync_read_sem(dev, argp); + case WINESYNC_IOC_RESET_EVENT: + return winesync_reset_event(dev, argp); case 
WINESYNC_IOC_SET_EVENT: return winesync_set_event(dev, argp); case WINESYNC_IOC_WAIT_ALL: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 34cd65d879a847..e71271fc44ba87 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -61,5 +61,7 @@ struct winesync_wait_args { struct winesync_event_args) #define WINESYNC_IOC_SET_EVENT _IOWR(WINESYNC_IOC_BASE, 11, \ struct winesync_event_args) +#define WINESYNC_IOC_RESET_EVENT _IOWR(WINESYNC_IOC_BASE, 12, \ + struct winesync_event_args) #endif From c527bc754310d680d5c5fe82bc87505e25f68fd4 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 19:10:12 -0600 Subject: [PATCH 24/57] winesync: Introduce WINESYNC_IOC_PULSE_EVENT. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 11 +++++++++-- include/uapi/linux/winesync.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index a93f173127f44c..27d5baa457dfcc 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -704,7 +704,8 @@ static int winesync_kill_owner(struct winesync_device *dev, void __user *argp) return 0; } -static int winesync_set_event(struct winesync_device *dev, void __user *argp) +static int winesync_set_event(struct winesync_device *dev, void __user *argp, + bool pulse) { struct winesync_event_args __user *user_args = argp; struct winesync_event_args args; @@ -726,6 +727,8 @@ static int winesync_set_event(struct winesync_device *dev, void __user *argp) event->u.event.signaled = true; try_wake_all_obj(dev, event); try_wake_any_event(event); + if (pulse) + event->u.event.signaled = false; spin_unlock(&event->lock); spin_unlock(&dev->wait_all_lock); @@ -735,6 +738,8 @@ static int winesync_set_event(struct winesync_device *dev, void __user *argp) prev_state = event->u.event.signaled; event->u.event.signaled = true; try_wake_any_event(event); + if (pulse) + event->u.event.signaled = false; 
spin_unlock(&event->lock); } @@ -1070,6 +1075,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_delete(dev, argp); case WINESYNC_IOC_KILL_OWNER: return winesync_kill_owner(dev, argp); + case WINESYNC_IOC_PULSE_EVENT: + return winesync_set_event(dev, argp, true); case WINESYNC_IOC_PUT_MUTEX: return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: @@ -1081,7 +1088,7 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, case WINESYNC_IOC_RESET_EVENT: return winesync_reset_event(dev, argp); case WINESYNC_IOC_SET_EVENT: - return winesync_set_event(dev, argp); + return winesync_set_event(dev, argp, false); case WINESYNC_IOC_WAIT_ALL: return winesync_wait_all(dev, argp); case WINESYNC_IOC_WAIT_ANY: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index e71271fc44ba87..7c09d0e9733cdf 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -63,5 +63,7 @@ struct winesync_wait_args { struct winesync_event_args) #define WINESYNC_IOC_RESET_EVENT _IOWR(WINESYNC_IOC_BASE, 12, \ struct winesync_event_args) +#define WINESYNC_IOC_PULSE_EVENT _IOWR(WINESYNC_IOC_BASE, 13, \ + struct winesync_event_args) #endif From 729a7e547c8ddb97829e8489829b0c41a3588799 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 19:14:00 -0600 Subject: [PATCH 25/57] winesync: Introduce WINESYNC_IOC_READ_EVENT. 
Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/winesync.h | 2 ++ 2 files changed, 32 insertions(+) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 27d5baa457dfcc..0f8a8a94eef833 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -639,6 +639,34 @@ static int winesync_read_mutex(struct winesync_device *dev, void __user *argp) return ret; } +static int winesync_read_event(struct winesync_device *dev, void __user *argp) +{ + struct winesync_event_args __user *user_args = argp; + struct winesync_event_args args; + struct winesync_obj *event; + __u32 id; + + if (get_user(id, &user_args->event)) + return -EFAULT; + + event = get_obj_typed(dev, id, WINESYNC_TYPE_EVENT); + if (!event) + return -EINVAL; + + args.event = id; + spin_lock(&event->lock); + args.manual = event->u.event.manual; + args.signaled = event->u.event.signaled; + spin_unlock(&event->lock); + + put_obj(event); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return 0; +} + /* * Actually change the mutex state to mark its owner as dead. 
*/ @@ -1081,6 +1109,8 @@ static long winesync_char_ioctl(struct file *file, unsigned int cmd, return winesync_put_mutex(dev, argp); case WINESYNC_IOC_PUT_SEM: return winesync_put_sem(dev, argp); + case WINESYNC_IOC_READ_EVENT: + return winesync_read_event(dev, argp); case WINESYNC_IOC_READ_MUTEX: return winesync_read_mutex(dev, argp); case WINESYNC_IOC_READ_SEM: diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index 7c09d0e9733cdf..fb3788339ffe9a 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -65,5 +65,7 @@ struct winesync_wait_args { struct winesync_event_args) #define WINESYNC_IOC_PULSE_EVENT _IOWR(WINESYNC_IOC_BASE, 13, \ struct winesync_event_args) +#define WINESYNC_IOC_READ_EVENT _IOWR(WINESYNC_IOC_BASE, 14, \ + struct winesync_event_args) #endif From c05d79b3cb8cc494637933476f59a64ed0a32a96 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 19:34:47 -0600 Subject: [PATCH 26/57] selftests: winesync: Add some tests for manual-reset event state. 
Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index ad6d0f9a2a35b5..7e99f09b113b8d 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -85,6 +85,30 @@ static int put_mutex(int fd, __u32 mutex, __u32 owner, __u32 *count) return ret; } +static int read_event_state(int fd, __u32 event, __u32 *signaled, __u32 *manual) +{ + struct winesync_event_args args; + int ret; + + args.event = event; + args.signaled = 0xdeadbeef; + args.manual = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_READ_EVENT, &args); + *signaled = args.signaled; + *manual = args.manual; + return ret; +} + +#define check_event_state(fd, event, signaled, manual) \ + ({ \ + __u32 __signaled, __manual; \ + int ret = read_event_state((fd), (event), \ + &__signaled, &__manual); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((signaled), __signaled); \ + EXPECT_EQ((manual), __manual); \ + }) + static int wait_objs(int fd, unsigned long request, __u32 count, const __u32 *objs, __u32 owner, __u32 *index) { @@ -350,6 +374,74 @@ TEST(mutex_state) close(fd); } +TEST(manual_event_state) +{ + struct winesync_event_args event_args; + __u32 index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 1; + event_args.signaled = 0; + event_args.event = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, event_args.event); + check_event_state(fd, event_args.event, 0, 1); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, 
ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(fd, event_args.event, 1, 1); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST(test_wait_any) { struct winesync_mutex_args mutex_args = {0}; From 2909c613e579b1d1f5f14cfb7f3a2e1723c3f73a Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 19:45:39 -0600 Subject: [PATCH 27/57] selftests: winesync: Add some tests for auto-reset event state. 
Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 7e99f09b113b8d..3a9ac69308afc0 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -442,6 +442,65 @@ TEST(manual_event_state) close(fd); } +TEST(auto_event_state) +{ + struct winesync_event_args event_args; + __u32 index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 0; + event_args.signaled = 1; + event_args.event = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, event_args.event); + + check_event_state(fd, event_args.event, 1, 0); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 1, 0); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(fd, event_args.event, 0, 0); + + event_args.signaled = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_any(fd, 1, &event_args.event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, 
event_args.event, 0, 0); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST(test_wait_any) { struct winesync_mutex_args mutex_args = {0}; From 77068fdaeb5a74110ee2eb6bb826ce2774e15e6d Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 21:00:50 -0600 Subject: [PATCH 28/57] selftests: winesync: Add some tests for wakeup signaling with events. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 152 +++++++++++++++++- 1 file changed, 150 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 3a9ac69308afc0..2ccc51510230bf 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -610,6 +610,7 @@ TEST(test_wait_any) TEST(test_wait_all) { + struct winesync_event_args event_args = {0}; struct winesync_mutex_args mutex_args = {0}; struct winesync_sem_args sem_args = {0}; __u32 objs[2], owner, index; @@ -632,6 +633,11 @@ TEST(test_wait_all) EXPECT_EQ(0, ret); EXPECT_NE(0xdeadbeef, mutex_args.mutex); + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + objs[0] = sem_args.sem; objs[1] = mutex_args.mutex; @@ -680,6 +686,14 @@ TEST(test_wait_all) check_sem_state(fd, sem_args.sem, 1, 3); check_mutex_state(fd, mutex_args.mutex, 1, 123); + objs[0] = sem_args.sem; + objs[1] = event_args.event; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(fd, sem_args.sem, 0, 3); + check_event_state(fd, event_args.event, 1, 1); + /* test waiting on the same object twice */ objs[0] = objs[1] = sem_args.sem; ret = wait_all(fd, 2, objs, 123, &index); @@ -690,6 +704,8 @@ TEST(test_wait_all) EXPECT_EQ(0, ret); ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); EXPECT_EQ(0, ret); + ret = 
ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); close(fd); } @@ -829,6 +845,7 @@ static int wait_for_thread(pthread_t thread, unsigned int ms) TEST(wake_any) { + struct winesync_event_args event_args = {0}; struct winesync_mutex_args mutex_args = {0}; struct winesync_wait_args wait_args = {0}; struct winesync_sem_args sem_args = {0}; @@ -918,10 +935,103 @@ TEST(wake_any) EXPECT_EQ(0, thread_args.ret); EXPECT_EQ(1, wait_args.index); + /* test waking events */ + + event_args.manual = false; + event_args.signaled = false; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + objs[1] = event_args.event; + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + event_args.manual = true; + event_args.signaled = false; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + objs[1] = event_args.event; + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = 
pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 1, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, event_args.signaled); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, event_args.signaled); + check_event_state(fd, event_args.event, 0, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + /* delete an object while it's being waited on */ get_abs_timeout(&timeout, CLOCK_MONOTONIC, 200); wait_args.owner = 123; + objs[1] = mutex_args.mutex; ret = pthread_create(&thread, NULL, wait_thread, &thread_args); EXPECT_EQ(0, ret); @@ -943,11 +1053,13 @@ TEST(wake_any) TEST(wake_all) { + struct winesync_event_args manual_event_args = {0}; + struct winesync_event_args auto_event_args = {0}; struct winesync_mutex_args mutex_args = {0}; struct winesync_wait_args wait_args = {0}; struct winesync_sem_args sem_args = {0}; struct wait_args thread_args; - __u32 objs[2], count, index; + __u32 objs[4], count, index; struct timespec timeout; pthread_t thread; int fd, ret; @@ -969,13 +1081,25 @@ TEST(wake_all) EXPECT_EQ(0, ret); EXPECT_NE(0xdeadbeef, mutex_args.mutex); + manual_event_args.manual = true; + manual_event_args.signaled = true; + ret = 
ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + + auto_event_args.manual = false; + auto_event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &auto_event_args); + EXPECT_EQ(0, ret); + objs[0] = sem_args.sem; objs[1] = mutex_args.mutex; + objs[2] = manual_event_args.event; + objs[3] = auto_event_args.event; get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); wait_args.timeout = (uintptr_t)&timeout; wait_args.objs = (uintptr_t)objs; - wait_args.count = 2; + wait_args.count = 4; wait_args.owner = 456; thread_args.fd = fd; thread_args.args = &wait_args; @@ -1009,12 +1133,32 @@ TEST(wake_all) check_mutex_state(fd, mutex_args.mutex, 0, 0); + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, manual_event_args.signaled); + sem_args.count = 2; ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); EXPECT_EQ(0, ret); EXPECT_EQ(0, sem_args.count); + check_sem_state(fd, sem_args.sem, 2, 3); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &auto_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, auto_event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &manual_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, manual_event_args.signaled); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &auto_event_args); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, auto_event_args.signaled); + check_sem_state(fd, sem_args.sem, 1, 3); check_mutex_state(fd, mutex_args.mutex, 1, 456); + check_event_state(fd, manual_event_args.event, 1, 1); + check_event_state(fd, auto_event_args.event, 0, 0); ret = wait_for_thread(thread, 100); EXPECT_EQ(0, ret); @@ -1034,6 +1178,10 @@ TEST(wake_all) EXPECT_EQ(0, ret); ret = ioctl(fd, WINESYNC_IOC_DELETE, &mutex_args.mutex); EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &manual_event_args.event); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &auto_event_args.event); + EXPECT_EQ(0, ret); ret = wait_for_thread(thread, 200); EXPECT_EQ(0, ret); From 
1769791f96e1e9ea9b174c21cc49592ac94092b3 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 21:06:22 -0600 Subject: [PATCH 29/57] selftests: winesync: Add some tests for invalid object handling with events. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index 2ccc51510230bf..f2e18836c73372 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -712,6 +712,7 @@ TEST(test_wait_all) TEST(invalid_objects) { + struct winesync_event_args event_args = {0}; struct winesync_mutex_args mutex_args = {0}; struct winesync_wait_args wait_args = {0}; struct winesync_sem_args sem_args = {0}; @@ -737,6 +738,22 @@ TEST(invalid_objects) EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_READ_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + wait_args.objs = (uintptr_t)objs; wait_args.count = 1; ret = ioctl(fd, WINESYNC_IOC_WAIT_ANY, &wait_args); @@ -763,6 +780,23 @@ TEST(invalid_objects) EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); + event_args.event = sem_args.sem; + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, WINESYNC_IOC_PULSE_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = ioctl(fd, 
WINESYNC_IOC_READ_EVENT, &event_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + objs[0] = sem_args.sem; objs[1] = sem_args.sem + 1; wait_args.count = 2; From 5faf1c9db2ac51161db93457f432710b86bdb251 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 19 Jan 2022 22:01:46 -0600 Subject: [PATCH 30/57] docs: winesync: Document event APIs. Signed-off-by: Kai Krakow --- Documentation/userspace-api/winesync.rst | 104 ++++++++++++++++++++++- 1 file changed, 101 insertions(+), 3 deletions(-) diff --git a/Documentation/userspace-api/winesync.rst b/Documentation/userspace-api/winesync.rst index 34e54be229cfc2..ffa2f8fbc7e3d0 100644 --- a/Documentation/userspace-api/winesync.rst +++ b/Documentation/userspace-api/winesync.rst @@ -18,8 +18,8 @@ interfaces such as futex(2) and poll(2). Synchronization primitives ========================== -The winesync driver exposes two types of synchronization primitives, -semaphores and mutexes. +The winesync driver exposes three types of synchronization primitives: +semaphores, mutexes, and events. A semaphore holds a single volatile 32-bit counter, and a static 32-bit integer denoting the maximum value. It is considered signaled @@ -45,6 +45,12 @@ intended use is to store a thread identifier; however, the winesync driver does not actually validate that a calling thread provides consistent or unique identifiers. +An event holds a volatile boolean state denoting whether it is +signaled or not. There are two types of events, auto-reset and +manual-reset. An auto-reset event is designaled when a wait is +satisfied; a manual-reset event is not. The event type is specified +when the event is created. + Unless specified otherwise, all operations on an object are atomic and totally ordered with respect to other operations on the same object. 
@@ -78,6 +84,12 @@ structures used in ioctl calls:: __u32 count; }; + struct winesync_event_args { + __u32 event; + __u32 manual; + __u32 signaled; + }; + struct winesync_wait_args { __u64 timeout; __u64 objs; @@ -125,6 +137,22 @@ The ioctls are as follows: If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is zero and ``count`` is nonzero, the function fails with ``EINVAL``. +.. c:macro:: WINESYNC_IOC_CREATE_EVENT + + Create an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - On output, contains the identifier of the created event. + * - ``signaled`` + - If nonzero, the event is initially signaled, otherwise + nonsignaled. + * - ``manual`` + - If nonzero, the event is a manual-reset event, otherwise + auto-reset. + .. c:macro:: WINESYNC_IOC_DELETE Delete an object of any type. Takes an input-only pointer to a @@ -178,6 +206,60 @@ The ioctls are as follows: unowned and signaled, and eligible threads waiting on it will be woken as appropriate. +.. c:macro:: WINESYNC_IOC_SET_EVENT + + Signal an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to set. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + + Eligible threads will be woken, and auto-reset events will be + designaled appropriately. + +.. c:macro:: WINESYNC_IOC_RESET_EVENT + + Designal an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to reset. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + +.. c:macro:: WINESYNC_IOC_PULSE_EVENT + + Wake threads waiting on an event object without leaving it in a + signaled state. 
Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object to pulse. + * - ``signaled`` + - On output, contains the previous state of the event. + * - ``manual`` + - Unused. + + A pulse operation can be thought of as a set followed by a reset, + performed as a single atomic operation. If two threads are waiting + on an auto-reset event which is pulsed, only one will be woken. If + two threads are waiting on a manual-reset event which is pulsed, both + will be woken. However, in both cases, the event will be unsignaled + afterwards, and a simultaneous read operation will always report the + event as unsignaled. + .. c:macro:: WINESYNC_IOC_READ_SEM Read the current state of a semaphore object. Takes a pointer to @@ -211,6 +293,21 @@ The ioctls are as follows: ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to zero. +.. c:macro:: WINESYNC_IOC_READ_EVENT + + Read the current state of an event object. Takes a pointer to struct + :c:type:`winesync_event_args`, which is used as follows: + + .. list-table:: + + * - ``event`` + - Event object. + * - ``signaled`` + - On output, contains the current state of the event. + * - ``manual`` + - On output, contains 1 if the event is a manual-reset event, + and 0 otherwise. + .. c:macro:: WINESYNC_IOC_KILL_OWNER Mark any mutexes owned by the given owner as unowned and @@ -272,7 +369,8 @@ The ioctls are as follows: considered to be signaled if it is unowned or if its owner matches the ``owner`` argument, and is acquired by incrementing its recursion count by one and setting its owner to the ``owner`` - argument. + argument. An auto-reset event is acquired by designaling it; a + manual-reset event is not affected by acquisition. Acquisition is atomic and totally ordered with respect to other operations on the same object. 
If two wait operations (with From 8986b60caac41500f27aafcf90f138ae485b10e0 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 13 Apr 2022 20:02:39 -0500 Subject: [PATCH 31/57] winesync: Introduce alertable waits. Signed-off-by: Kai Krakow --- drivers/misc/winesync.c | 68 ++++++++++++++++++++++++++++++----- include/uapi/linux/winesync.h | 2 +- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/drivers/misc/winesync.c b/drivers/misc/winesync.c index 0f8a8a94eef833..64b379d846dbee 100644 --- a/drivers/misc/winesync.c +++ b/drivers/misc/winesync.c @@ -842,10 +842,11 @@ static int setup_wait(struct winesync_device *dev, const __u32 count = args->count; struct winesync_q *q; ktime_t timeout = 0; + __u32 total_count; __u32 *ids; __u32 i, j; - if (!args->owner || args->pad) + if (!args->owner) return -EINVAL; if (args->timeout) { @@ -859,7 +860,11 @@ static int setup_wait(struct winesync_device *dev, timeout = timespec64_to_ns(&to); } - ids = kmalloc_array(count, sizeof(*ids), GFP_KERNEL); + total_count = count; + if (args->alert) + total_count++; + + ids = kmalloc_array(total_count, sizeof(*ids), GFP_KERNEL); if (!ids) return -ENOMEM; if (copy_from_user(ids, u64_to_user_ptr(args->objs), @@ -867,8 +872,10 @@ static int setup_wait(struct winesync_device *dev, kfree(ids); return -EFAULT; } + if (args->alert) + ids[count] = args->alert; - q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); + q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); if (!q) { kfree(ids); return -ENOMEM; @@ -880,7 +887,7 @@ static int setup_wait(struct winesync_device *dev, q->ownerdead = false; q->count = count; - for (i = 0; i < count; i++) { + for (i = 0; i < total_count; i++) { struct winesync_q_entry *entry = &q->entries[i]; struct winesync_obj *obj = get_obj(dev, ids[i]); @@ -935,9 +942,9 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) { struct winesync_wait_args args; struct winesync_q *q; + __u32 i, total_count; ktime_t 
timeout; int signaled; - __u32 i; int ret; if (copy_from_user(&args, argp, sizeof(args))) @@ -947,9 +954,13 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) if (ret < 0) return ret; + total_count = args.count; + if (args.alert) + total_count++; + /* queue ourselves */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct winesync_q_entry *entry = &q->entries[i]; struct winesync_obj *obj = entry->obj; @@ -958,9 +969,15 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) spin_unlock(&obj->lock); } - /* check if we are already signaled */ + /* + * Check if we are already signaled. + * + * Note that the API requires that normal objects are checked before + * the alert event. Hence we queue the alert event last, and check + * objects in order. + */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct winesync_obj *obj = q->entries[i].obj; if (atomic_read(&q->signaled) != -1) @@ -977,7 +994,7 @@ static int winesync_wait_any(struct winesync_device *dev, void __user *argp) /* and finally, unqueue */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct winesync_q_entry *entry = &q->entries[i]; struct winesync_obj *obj = entry->obj; @@ -1037,6 +1054,14 @@ static int winesync_wait_all(struct winesync_device *dev, void __user *argp) */ list_add_tail(&entry->node, &obj->all_waiters); } + if (args.alert) { + struct winesync_q_entry *entry = &q->entries[args.count]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_add_tail(&entry->node, &obj->any_waiters); + spin_unlock(&obj->lock); + } /* check if we are already signaled */ @@ -1044,6 +1069,21 @@ static int winesync_wait_all(struct winesync_device *dev, void __user *argp) spin_unlock(&dev->wait_all_lock); + /* + * Check if the alert event is signaled, making sure to do so only + * after checking if the other objects are signaled. 
+ */ + + if (args.alert) { + struct winesync_obj *obj = q->entries[args.count].obj; + + if (atomic_read(&q->signaled) == -1) { + spin_lock(&obj->lock); + try_wake_any_obj(obj); + spin_unlock(&obj->lock); + } + } + /* sleep */ ret = winesync_schedule(q, args.timeout ? &timeout : NULL); @@ -1066,6 +1106,16 @@ static int winesync_wait_all(struct winesync_device *dev, void __user *argp) put_obj(obj); } + if (args.alert) { + struct winesync_q_entry *entry = &q->entries[args.count]; + struct winesync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_del(&entry->node); + spin_unlock(&obj->lock); + + put_obj(obj); + } spin_unlock(&dev->wait_all_lock); diff --git a/include/uapi/linux/winesync.h b/include/uapi/linux/winesync.h index fb3788339ffe9a..5b4e369f74693a 100644 --- a/include/uapi/linux/winesync.h +++ b/include/uapi/linux/winesync.h @@ -34,7 +34,7 @@ struct winesync_wait_args { __u32 count; __u32 owner; __u32 index; - __u32 pad; + __u32 alert; }; #define WINESYNC_IOC_BASE 0xf7 From 0154c74c6092d0cff42d6b4a08ab070c98d97cc0 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 20 Apr 2022 18:08:37 -0500 Subject: [PATCH 32/57] selftests: winesync: Add tests for alertable waits. 
Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 191 +++++++++++++++++- 1 file changed, 188 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index f2e18836c73372..a87e3c48709b7c 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -110,7 +110,7 @@ static int read_event_state(int fd, __u32 event, __u32 *signaled, __u32 *manual) }) static int wait_objs(int fd, unsigned long request, __u32 count, - const __u32 *objs, __u32 owner, __u32 *index) + const __u32 *objs, __u32 owner, __u32 alert, __u32 *index) { struct winesync_wait_args args = {0}; struct timespec timeout; @@ -123,6 +123,7 @@ static int wait_objs(int fd, unsigned long request, __u32 count, args.objs = (uintptr_t)objs; args.owner = owner; args.index = 0xdeadbeef; + args.alert = alert; ret = ioctl(fd, request, &args); *index = args.index; return ret; @@ -131,13 +132,29 @@ static int wait_objs(int fd, unsigned long request, __u32 count, static int wait_any(int fd, __u32 count, const __u32 *objs, __u32 owner, __u32 *index) { - return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, count, objs, owner, index); + return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, + count, objs, owner, 0, index); } static int wait_all(int fd, __u32 count, const __u32 *objs, __u32 owner, __u32 *index) { - return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, count, objs, owner, index); + return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, + count, objs, owner, 0, index); +} + +static int wait_any_alert(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 alert, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ANY, + count, objs, owner, alert, index); +} + +static int wait_all_alert(int fd, __u32 count, const __u32 *objs, + __u32 owner, __u32 alert, __u32 *index) +{ + return wait_objs(fd, WINESYNC_IOC_WAIT_ALL, + count, objs, owner, alert, index); } 
TEST(semaphore_state) @@ -1225,4 +1242,172 @@ TEST(wake_all) close(fd); } +TEST(alert_any) +{ + struct winesync_event_args event_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[0] = sem_args.sem; + + sem_args.count = 1; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[1] = sem_args.sem; + + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + sem_args.sem = objs[0]; + sem_args.count = 1; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = 
wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[1]); + EXPECT_EQ(0, ret); + + close(fd); +} + +TEST(alert_all) +{ + struct winesync_event_args event_args = {0}; + struct winesync_sem_args sem_args = {0}; + __u32 objs[2], index; + int fd, ret; + + fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[0] = sem_args.sem; + + sem_args.count = 1; + sem_args.max = 2; + sem_args.sem = 0xdeadbeef; + ret = ioctl(fd, WINESYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(0, ret); + EXPECT_NE(0xdeadbeef, sem_args.sem); + objs[1] = sem_args.sem; + + event_args.manual = true; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + ret = ioctl(fd, WINESYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_EQ(0, ret); + + sem_args.sem = objs[1]; + sem_args.count = 2; + ret = ioctl(fd, WINESYNC_IOC_PUT_SEM, &sem_args); + EXPECT_EQ(0, ret); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret 
= wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); + EXPECT_EQ(0, ret); + + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[0]); + EXPECT_EQ(0, ret); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &objs[1]); + EXPECT_EQ(0, ret); + + close(fd); +} + TEST_HARNESS_MAIN From 72990bdac9cba8cf056cd8f579afa78f80f8b8b4 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 20 Apr 2022 18:24:43 -0500 Subject: [PATCH 33/57] selftests: winesync: Add some tests for wakeup signaling via alerts. Signed-off-by: Kai Krakow --- .../selftests/drivers/winesync/winesync.c | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tools/testing/selftests/drivers/winesync/winesync.c b/tools/testing/selftests/drivers/winesync/winesync.c index a87e3c48709b7c..169e922484b008 100644 --- a/tools/testing/selftests/drivers/winesync/winesync.c +++ b/tools/testing/selftests/drivers/winesync/winesync.c @@ -1245,8 +1245,12 @@ TEST(wake_all) TEST(alert_any) { struct winesync_event_args event_args = {0}; + struct winesync_wait_args wait_args = {0}; struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + struct timespec timeout; __u32 objs[2], index; + pthread_t thread; int fd, ret; fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); @@ -1295,6 +1299,35 @@ TEST(alert_any) EXPECT_EQ(0, ret); EXPECT_EQ(2, index); + /* test wakeup via alert */ + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event_args.event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request =
WINESYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); EXPECT_EQ(0, ret); @@ -1336,8 +1369,12 @@ TEST(alert_any) TEST(alert_all) { struct winesync_event_args event_args = {0}; + struct winesync_wait_args wait_args = {0}; struct winesync_sem_args sem_args = {0}; + struct wait_args thread_args; + struct timespec timeout; __u32 objs[2], index; + pthread_t thread; int fd, ret; fd = open("/dev/winesync", O_CLOEXEC | O_RDONLY); @@ -1372,6 +1409,35 @@ TEST(alert_all) EXPECT_EQ(0, ret); EXPECT_EQ(2, index); + /* test wakeup via alert */ + + ret = ioctl(fd, WINESYNC_IOC_RESET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + get_abs_timeout(&timeout, CLOCK_MONOTONIC, 1000); + wait_args.timeout = (uintptr_t)&timeout; + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event_args.event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = WINESYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(fd, WINESYNC_IOC_SET_EVENT, &event_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + ret = ioctl(fd, WINESYNC_IOC_DELETE, &event_args.event); EXPECT_EQ(0, ret); From 9b1209de561a963621fd8df49ab45b00bb5d8795 Mon Sep 17 00:00:00 2001 From: Zebediah Figura Date: Wed, 20 Apr 2022 18:58:17 -0500 Subject: [PATCH 34/57] docs: winesync: Document alertable waits. 
Signed-off-by: Kai Krakow --- Documentation/userspace-api/winesync.rst | 40 ++++++++++++++++++------ 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/Documentation/userspace-api/winesync.rst b/Documentation/userspace-api/winesync.rst index ffa2f8fbc7e3d0..f0110d2744c709 100644 --- a/Documentation/userspace-api/winesync.rst +++ b/Documentation/userspace-api/winesync.rst @@ -354,9 +354,13 @@ The ioctls are as follows: ``EINVAL``. * - ``index`` - On success, contains the index (into ``objs``) of the object - which was signaled. - * - ``pad`` - - This field is not used and must be set to zero. + which was signaled. If ``alert`` was signaled instead, + this contains ``count``. + * - ``alert`` + - Optional event object identifier. If nonzero, this specifies + an "alert" event object which, if signaled, will terminate + the wait. If nonzero, the identifier must point to a valid + event. This function attempts to acquire one of the given objects. If unable to do so, it sleeps until an object becomes signaled, @@ -385,9 +389,19 @@ The ioctls are as follows: the given owner (with a recursion count of 1) and as no longer inconsistent, and ``index`` is still set to the index of the mutex. - It is valid to pass the same object more than once. If a wakeup - occurs due to that object being signaled, ``index`` is set to the - lowest index corresponding to that object. + The ``alert`` argument is an "extra" event which can terminate the + wait, independently of all other objects. If members of ``objs`` and + ``alert`` are both simultaneously signaled, a member of ``objs`` + will always be given priority and acquired first. Aside from this, + for "any" waits, there is no difference between passing an event as + this parameter, and passing it as an additional object at the end of + the ``objs`` array. For "all" waits, there is an additional + difference, as described below. 
+ + It is valid to pass the same object more than once, including by + passing the same event in the ``objs`` array and in ``alert``. If a + wakeup occurs due to that object being signaled, ``index`` is set to + the lowest index corresponding to that object. The function may fail with ``EINTR`` if a signal is received. @@ -396,7 +410,7 @@ The ioctls are as follows: Poll on a list of objects, atomically acquiring all of them. Takes a pointer to struct :c:type:`winesync_wait_args`, which is used identically to ``WINESYNC_IOC_WAIT_ANY``, except that ``index`` is - always filled with zero on success. + always filled with zero on success if not woken via alert. This function attempts to simultaneously acquire all of the given objects. If unable to do so, it sleeps until all objects become @@ -417,6 +431,14 @@ The ioctls are as follows: objects are specified, there is no way to know which were marked as inconsistent. + As with "any" waits, the ``alert`` argument is an "extra" event + which can terminate the wait. Critically, however, an "all" wait + will succeed if all members in ``objs`` are signaled, *or* if + ``alert`` is signaled. In the latter case ``index`` will be set to + ``count``. As with "any" waits, if both conditions are filled, the + former takes priority, and objects in ``objs`` will be acquired. + Unlike ``WINESYNC_IOC_WAIT_ANY``, it is not valid to pass the same - object more than once. If this is attempted, the function fails with - ``EINVAL``. + object more than once, nor is it valid to pass the same object in + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. From 21c165023279a91d8b224d80b6343c0ba0587ae1 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 7 Dec 2016 21:13:16 +1100 Subject: [PATCH 35/57] Make threaded IRQs optionally the default which can be disabled.
Signed-off-by: Kai Krakow --- include/linux/interrupt.h | 3 +++ kernel/irq/Kconfig | 17 +++++++++++++++++ kernel/irq/manage.c | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 4a1dc88ddbff9a..74d33218cada80 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -509,6 +509,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 2531f3496ab6d7..ab1b43161837be 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -112,6 +112,23 @@ config GENERIC_IRQ_RESERVATION_MODE config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. 
+ config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a054cd5ec08bce..7b3f997fec3e37 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { From 60f081c3685e1a0306886f1c17bc404568f7d5e6 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Wed, 6 May 2020 14:37:44 +0300 Subject: [PATCH 36/57] mm: Support soft dirty flag reset for VA range. v2: ported from 6.1 to 6.6 Signed-off-by: Kai Krakow --- fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 103 insertions(+), 26 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 59571737e16771..b2572b88de8006 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1077,6 +1077,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1168,6 +1170,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1225,9 +1229,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start 
|| end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. @@ -1251,10 +1257,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[18]; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1263,12 +1271,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1281,40 +1311,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if 
(mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? 
-1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1346,6 +1422,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) @@ -1418,13 +1495,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -1483,7 +1560,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -1504,7 +1581,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); From 72ffbb5641637f372698d0ddea263cb8b9cb5ad0 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Thu, 7 May 2020 14:05:31 +0300 Subject: [PATCH 37/57] mm: Support soft dirty flag read with reset. 
v2: ported from 6.1 to 6.6 Signed-off-by: Kai Krakow --- fs/proc/base.c | 3 + fs/proc/internal.h | 1 + fs/proc/task_mmu.c | 139 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 127 insertions(+), 16 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 6e61d93ffa5523..417c0d720f05c8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3284,6 +3284,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff569..f3a16b26dd6e43 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b2572b88de8006..bcabc55ed83859 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1099,7 +1099,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return page_maybe_dma_pinned(page); } -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1109,37 +1109,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1150,14 +1159,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1412,6 +1424,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1443,6 +1456,14 @@ static int add_to_pagemap(pagemap_entry_t 
*pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { @@ -1450,6 +1471,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1552,6 +1576,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; struct page *page = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1611,6 +1649,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); return err; } @@ -1626,10 +1665,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else + { + pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -1728,8 +1775,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. 
*/ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -1738,6 +1785,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -1753,19 +1802,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); + + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -1790,18 +1858,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? 
*/ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -1809,6 +1894,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? ((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) @@ -1822,6 +1909,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + +static ssize_t pagemap_reset_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; @@ -1848,6 +1947,14 @@ const struct file_operations proc_pagemap_operations = { .open = pagemap_open, .release = pagemap_release, }; + +const struct file_operations proc_pagemap_reset_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_reset_read, + .open = pagemap_open, + .release = pagemap_release, +}; + #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA From 4b991a59c727c98be07504d8d32a82a67365f0bb Mon Sep 17 00:00:00 2001 From: Mark Weiman Date: Sun, 12 Aug 2018 11:36:21 
-0400 Subject: [PATCH 38/57] pci: Enable overrides for missing ACS capabilities This is an updated version of Alex Williamson's patch from: https://lkml.org/lkml/2013/5/30/513 Original commit message follows: PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that allows us to control whether transactions are allowed to be redirected in various subnodes of a PCIe topology. For instance, if two endpoints are below a root port or downstream switch port, the downstream port may optionally redirect transactions between the devices, bypassing upstream devices. The same can happen internally on multifunction devices. The transaction may never be visible to the upstream devices. One upstream device that we particularly care about is the IOMMU. If a redirection occurs in the topology below the IOMMU, then the IOMMU cannot provide isolation between devices. This is why the PCIe spec encourages topologies to include ACS support. Without it, we have to assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. Unfortunately, far too many topologies do not support ACS to make this a steadfast requirement. Even the latest chipsets from Intel are only sporadically supporting ACS. We have trouble getting interconnect vendors to include the PCIe spec required PCIe capability, let alone suggested features. Therefore, we need to add some flexibility. The pcie_acs_override= boot option lets users opt-in specific devices or sets of devices to assume ACS support. The "downstream" option assumes full ACS support on root ports and downstream switch ports. The "multifunction" option assumes the subset of ACS features available on multifunction endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" option enables ACS support on devices matching the provided vendor and device IDs, allowing more strategic ACS overrides. These options may be combined in any order. A maximum of 16 id specific overrides are available. 
It's suggested to use the most limited set of options necessary to avoid completely disabling ACS across the topology. Note to hardware vendors, we have facilities to permanently quirk specific devices which enforce isolation but not provide an ACS capability. Please contact me to have your devices added and save your customers the hassle of this boot option. Signed-off-by: Mark Weiman --- .../admin-guide/kernel-parameters.txt | 9 ++ drivers/pci/quirks.c | 103 ++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8d2f9ed3f1076e..4da131bf03d2ba 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4310,6 +4310,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. + pcie_acs_override = + [PCIE] Override missing PCIe ACS support for: + downstream + All downstream ports - full ACS capabilities + multifunction + All multifunction devices - multifunction ACS subset + id:nnnn:nnnn + Specific device - full ACS capabilities + Specified as vid:did (vendor/device ID) in hex noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ec4277d7835b23..6b84a6f7ee6e45 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3488,6 +3488,107 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f8, quirk_intel_mc_errata); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f9, quirk_intel_mc_errata); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65fa, quirk_intel_mc_errata); +static bool acs_on_downstream; +static bool acs_on_multifunction; + +#define NUM_ACS_IDS 16 +struct acs_on_id { + unsigned short vendor; + unsigned short device; +}; +static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; +static u8 max_acs_id; + +static __init int pcie_acs_override_setup(char *p) +{ + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "downstream", 10)) + acs_on_downstream = true; + if (!strncmp(p, "multifunction", 13)) + acs_on_multifunction = true; + if (!strncmp(p, "id:", 3)) { + char opt[5]; + int ret; + long val; + + if (max_acs_id >= NUM_ACS_IDS) { + pr_warn("Out of PCIe ACS override slots (%d)\n", + NUM_ACS_IDS); + goto next; + } + + p += 3; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } + acs_on_ids[max_acs_id].vendor = val; + + p += strcspn(p, ":"); + if (*p != ':') { + pr_warn("PCIe ACS invalid ID\n"); + goto next; + } + + p++; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } + acs_on_ids[max_acs_id].device = val; + max_acs_id++; + } +next: + p += strcspn(p, ","); + if (*p == ',') + p++; + } + + if (acs_on_downstream || acs_on_multifunction || max_acs_id) + pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); + + return 0; +} +early_param("pcie_acs_override", pcie_acs_override_setup); + +static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) +{ + int i; + + /* Never override ACS for 
legacy devices or devices with ACS caps */ + if (!pci_is_pcie(dev) || + pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) + return -ENOTTY; + + for (i = 0; i < max_acs_id; i++) + if (acs_on_ids[i].vendor == dev->vendor && + acs_on_ids[i].device == dev->device) + return 1; + + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_DOWNSTREAM: + case PCI_EXP_TYPE_ROOT_PORT: + if (acs_on_downstream) + return 1; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_LEG_END: + case PCI_EXP_TYPE_RC_END: + if (acs_on_multifunction && dev->multifunction) + return 1; + } + + return -ENOTTY; +} + /* * Ivytown NTB BAR sizes are misreported by the hardware due to an erratum. * To work around this, query the size it should be configured to by the @@ -5136,6 +5237,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, /* Wangxun nics */ { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + /* custom ACS overrides for any PCIe device */ + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; From 306984ad3f4c7d08cb37a3f83f2f15d60e5d2649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 25 Oct 2021 09:49:42 -0300 Subject: [PATCH 39/57] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode 31) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an option to wait on multiple futexes using the old interface, that uses opcode 31 through futex() syscall. Do that by just translation the old interface to use the new code. This allows old and stable versions of Proton to still use fsync in new kernel releases. 
Signed-off-by: André Almeida --- include/uapi/linux/futex.h | 13 +++++++ kernel/futex/syscalls.c | 75 +++++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 71a5df8d26898b..d375ab21cbf83f 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -68,6 +69,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e87..26d6da72d494c6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -142,6 +142,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -154,13 +155,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. 
+ */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + ret = futex_wait_multiple(vs, count, abs_time ? 
to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -180,6 +247,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -373,6 +443,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ From 0f1cc33b8e9a49a8ee6cdbd9d5ec40e556782eb1 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:10:06 +0100 Subject: [PATCH 40/57] ZEN: INTERACTIVE: Base config item Signed-off-by: Kai Krakow --- init/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e403a292563573..b641f9074047c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -132,6 +132,12 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+ config BROKEN bool From d0a264e4b13cbd54673b14f04acd01459356c009 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:11:05 +0100 Subject: [PATCH 41/57] ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices Signed-off-by: Kai Krakow --- block/elevator.c | 4 ++++ init/Kconfig | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8f5..cd1a2c9881bc60 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -576,7 +576,11 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + return elevator_find_get(q, "bfq"); +#else return elevator_find_get(q, "mq-deadline"); +#endif } /* diff --git a/init/Kconfig b/init/Kconfig index b641f9074047c7..89d4f52e3f6721 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -138,6 +138,10 @@ config ZEN_INTERACTIVE help Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+ --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq + config BROKEN bool From 04e1674bc0a790bc697a7380bdcdaa26f51d6ccf Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 12 Dec 2022 00:03:03 +0100 Subject: [PATCH 42/57] ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices Signed-off-by: Kai Krakow --- block/elevator.c | 6 ++++++ init/Kconfig | 1 + 2 files changed, 7 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index cd1a2c9881bc60..200eb60c8e8b52 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -574,7 +574,13 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) + return elevator_find_get(q, "kyber"); +#elif defined(CONFIG_ZEN_INTERACTIVE) + return elevator_find_get(q, "mq-deadline"); +#else return NULL; +#endif #if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) return elevator_find_get(q, "bfq"); diff --git a/init/Kconfig b/init/Kconfig index 89d4f52e3f6721..9bc110aa31334b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -141,6 +141,7 @@ config ZEN_INTERACTIVE --- Block Layer ---------------------------------------- Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber config BROKEN bool From c63152242d2ccea22bc8857e21b42ba0f2887dff Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:21:09 +0100 Subject: [PATCH 43/57] ZEN: INTERACTIVE: Enable background reclaim of hugepages Use [defer+madvise] as default khugepaged defrag strategy: For some reason, the default strategy to respond to THP fault fallbacks is still just madvise, meaning stall if the program wants transparent hugepages, but don't trigger a background reclaim / compaction if THP begins to fail allocations. 
This creates a snowball effect where we still use the THP code paths, but we almost always fail once a system has been active and busy for a while. The option "defer" was created for interactive systems where THP can still improve performance. If we have to fall back to a regular page due to an allocation failure or anything else, we will trigger a background reclaim and compaction so future THP attempts succeed and previous attempts eventually have their smaller pages combined without stalling running applications. We still want madvise to stall applications that explicitly want THP, so defer+madvise _does_ make a ton of sense. Make it the default for interactive systems, especially if the kernel maintainer left transparent hugepages on "always". Reasoning and details in the original patch: https://lwn.net/Articles/711248/ Signed-off-by: Kai Krakow --- init/Kconfig | 4 ++++ mm/huge_memory.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 9bc110aa31334b..a43a5efdc7bcd5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -143,6 +143,10 @@ config ZEN_INTERACTIVE Default scheduler for SQ..: mq-deadline -> bfq Default scheduler for MQ..: none -> kyber + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes + config BROKEN bool diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79fbd6ddec49f5..21911157436db0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -61,7 +61,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1< Date: Wed, 11 Aug 2021 18:47:46 -0500 Subject: [PATCH 44/57] ZEN: INTERACTIVE: Tune mgLRU to protect cache used in the last second Although not identical to the le9 patches that protect a byte-amount of cache through tunables, multigenerational LRU now supports protecting cache accessed in the last X milliseconds. In #218, Yu recommends starting with 1000ms and tuning as needed. 
This looks like a safe default and turning on this feature should help users that don't know they need it. Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/vmscan.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index a43a5efdc7bcd5..aec371d72f67a0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -146,6 +146,7 @@ config ZEN_INTERACTIVE --- Virtual Memory Subsystem --------------------------- Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms config BROKEN bool diff --git a/mm/vmscan.c b/mm/vmscan.c index 83fa8e924f8aea..e9e2cf830daa1d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4619,7 +4619,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { From fd3dc4b983cd0469880b56bffa074576b9436a45 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Tue, 31 Oct 2023 19:03:10 +0100 Subject: [PATCH 45/57] ZEN: INTERACTIVE: Tune EEVDF for interactivity 5.7: Take "sysctl_sched_nr_migrate" tune from early XanMod builds of 128. As of 5.7, XanMod uses 256 but that may affect applications that require timely response to IRQs. 5.15: Per [a comment][1] on our ZEN INTERACTIVE commit, reducing the cost of migration causes the system less responsive under high load. Most likely the combination of reduced migration cost + the higher number of tasks that can be migrated at once contributes to this. To better handle this situation, restore the mainline migration cost value and also reduce the max number of tasks that can be migrated in batch from 128 to 64. 
If this doesn't help, we'll restore the reduced migration cost and keep total number of tasks that can be migrated at once to 32. [1]: https://github.com/zen-kernel/zen-kernel/commit/be5ba234ca0a5aabe74bfc7e1f636f085bd3823c#commitcomment-63159674 6.6: Port the tuning to EEVDF, which removed a couple of settings. Signed-off-by: Kai Krakow --- init/Kconfig | 6 ++++++ kernel/sched/fair.c | 9 +++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 17 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index aec371d72f67a0..27e8890d51af99 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -148,6 +148,12 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.75 -> 0.4 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 64 + config BROKEN bool diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2e1009e5706ee..dcbc243dbc26f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -75,8 +75,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +#else unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +#endif /* * After fork, child runs first. If set to 0 (default) then @@ -135,8 +140,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8cbbbea7fdbbd6..dee7857bc5215d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2520,6 +2520,8 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 +#elif defined(CONFIG_ZEN_INTERACTIVE) +#define SCHED_NR_MIGRATE_BREAK 64 #else #define SCHED_NR_MIGRATE_BREAK 32 #endif From 8de8eb945646295e7cbebde3d7c1a3aa4060e67e Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:27:16 +0100 Subject: [PATCH 46/57] ZEN: INTERACTIVE: Tune ondemand governor for interactivity 4.10: During some personal testing with the Dolphin emulator, MuQSS has serious problems scaling its frequencies causing poor performance where boosting the CPU frequencies would have fixed them. Reducing the up_threshold to 45 with MuQSS appears to fix the issue, letting the introduction to "Star Wars: Rogue Leader" run at 100% speed versus about 80% on my test system. Also, lets refactor the definitions and include some indentation to help the reader discern what the scope of all the macros are. 5.4: On the last custom kernel benchmark from Phoronix with Xanmod, Michael configured all the kernels to run using ondemand instead of the kernel's [default selection][1]. This reminded me that another option outside of the kernels control is the user's choice to change the cpufreq governor, for better or for worse. In Liquorix, performance is the default governor whether you're running acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or LMT to control the power balance on their system, especially when they're plugged in or on battery. However, it's pretty clear to me a lot of people would choose ondemand over performance since it's not obvious it has huge performance ramifications with MuQSS, and ondemand otherwise is "good enough" for most people. 
Lets codify lower up thresholds for MuQSS to more closely synergize with its aggressive thread migration behavior. This way when ondemand is configured, you get sort of a "performance-lite" type of result but with the power savings you expect when leaving the running system idle. [1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel 5.14: Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot more of mainline scheduling and do a good job of pinning single threaded tasks to their respective core, there's still applications that confusingly run steady near 50% and benefit from going full speed or turbo when they need to run (emulators for more recent consoles come to mind). Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. 5.15: Remove MuQSS cpufreq configuration. Signed-off-by: Kai Krakow --- drivers/cpufreq/cpufreq_ondemand.c | 8 +++++++- init/Kconfig | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index c52d19d67557f5..03c8da688a8cdb 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,10 +18,16 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ +#if defined(CONFIG_ZEN_INTERACTIVE) +#define DEF_FREQUENCY_UP_THRESHOLD (55) +#define MICRO_FREQUENCY_UP_THRESHOLD (60) +#define DEF_SAMPLING_DOWN_FACTOR (5) +#else #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/init/Kconfig b/init/Kconfig index 27e8890d51af99..603f9780487141 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -154,6 +154,12 @@ config ZEN_INTERACTIVE Bandwidth slice size...........: 5 -> 3 ms Task 
rebalancing threshold.....: 32 -> 64 + --- CPUFreq Settings ----------------------------------- + + Ondemand sampling down factor..: 1 -> 5 + Ondemand default up threshold..: 80 -> 55 + Ondemand micro up threshold....: 95 -> 60 + config BROKEN bool From 2f971947f6c18ffa550efcf284a3f543c26e78c4 Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Sat, 5 Mar 2022 11:37:14 -0600 Subject: [PATCH 47/57] ZEN: INTERACTIVE: mm: Disable unevictable compaction This option is already disabled when CONFIG_PREEMPT_RT is enabled, lets turn it off when CONFIG_ZEN_INTERACTIVE is set as well. Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 603f9780487141..4f1827d7bc7cbb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -147,6 +147,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5b9..025833f2442694 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -653,7 +653,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # From cce1a04bb4692a6cf33dc90af09ffb8bf484ec78 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 24 Oct 2020 22:17:49 -0700 Subject: [PATCH 48/57] ZEN: INTERACTIVE: mm: Disable proactive compaction by default On-demand compaction works fine assuming that you don't have a need to spam the page allocator nonstop for large order page allocations. 
Signed-off-by: Sultan Alsawaf --- init/Kconfig | 1 + mm/compaction.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 4f1827d7bc7cbb..5fef432a7be6a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -148,6 +148,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/compaction.c b/mm/compaction.c index 61c741f11e9bb3..fc959a11fee2d7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1809,7 +1809,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int __read_mostly sysctl_compaction_proactiveness; +#else static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif static int sysctl_extfrag_threshold = 500; static int __read_mostly sysctl_compact_memory; From 9eb30550354abef50734645e1d48e70299f081fe Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 28 Mar 2020 13:06:28 -0700 Subject: [PATCH 49/57] ZEN: INTERACTIVE: mm: Disable watermark boosting by default What watermark boosting does is preemptively fire up kswapd to free memory when there hasn't been an allocation failure. It does this by increasing kswapd's high watermark goal and then firing up kswapd. The reason why this causes freezes is because, with the increased high watermark goal, kswapd will steal memory from processes that need it in order to make forward progress. These processes will, in turn, try to allocate memory again, which will cause kswapd to steal necessary pages from those processes again, in a positive feedback loop known as page thrashing. 
When page thrashing occurs, your system is essentially livelocked until the necessary forward progress can be made to stop processes from trying to continuously allocate memory and trigger kswapd to steal it back. This problem already occurs with kswapd *without* watermark boosting, but it's usually only encountered on machines with a small amount of memory and/or a slow CPU. Watermark boosting just makes the existing problem worse enough to notice on higher spec'd machines. Disable watermark boosting by default since it's a total dumpster fire. I can't imagine why anyone would want to explicitly enable it, but the option is there in case someone does. Signed-off-by: Sultan Alsawaf --- init/Kconfig | 1 + mm/page_alloc.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 5fef432a7be6a1..9424020acbf16e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -149,6 +149,7 @@ config ZEN_INTERACTIVE MG-LRU minimum cache TTL.......: 0 -> 1000 ms Compact unevictable............: yes -> no Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e99d3223f0fc29..94f6dd38e898b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,7 +286,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ From 9b06903afef658ec0af97f1c9032af8f073c16b8 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 20 Oct 2021 20:50:11 -0700 Subject: [PATCH 50/57] ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to reduce scheduling delays The page allocator processes free pages in 
groups of pageblocks, where the size of a pageblock is typically quite large (1024 pages without hugetlbpage support). Pageblocks are processed atomically with the zone lock held, which can cause severe scheduling delays on both the CPU going through the pageblock and any other CPUs waiting to acquire the zone lock. A frequent offender is move_freepages_block(), which is used by rmqueue() for page allocation. As it turns out, there's no requirement for pageblocks to be so large, so the pageblock order can simply be reduced to ease the scheduling delays and zone lock contention. PAGE_ALLOC_COSTLY_ORDER is used as a reasonable setting to ensure non-costly page allocation requests can still be serviced without always needing to free up more than one pageblock's worth of pages at a time. This has a noticeable effect on overall system latency when memory pressure is elevated. The various mm functions which operate on pageblocks no longer appear in the preemptoff tracer, where previously they would spend up to 100 ms on a mobile arm64 CPU processing a pageblock with preemption disabled and the zone lock held. 
Signed-off-by: Sultan Alsawaf --- include/linux/pageblock-flags.h | 4 ++++ init/Kconfig | 1 + 2 files changed, 5 insertions(+) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c09504173..a029ac2e355405 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,11 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ +#ifdef CONFIG_ZEN_INTERACTIVE +#define pageblock_order PAGE_ALLOC_COSTLY_ORDER +#else #define pageblock_order MAX_ORDER +#endif #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/init/Kconfig b/init/Kconfig index 9424020acbf16e..248b037a3ce76b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -150,6 +150,7 @@ config ZEN_INTERACTIVE Compact unevictable............: yes -> no Compaction proactiveness.......: 20 -> 0 Watermark boost factor.........: 1.5 -> 0 + Pageblock order................: 10 -> 3 --- EEVDF CPU Scheduler -------------------------------- From 8abbf13187ca4fa4a30c07a3e1b6cca4a0806b2c Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Mon, 5 Sep 2022 11:35:20 -0500 Subject: [PATCH 51/57] ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead Per an [issue][1] on the chromium project, swap-in readahead causes more jank than not. This might be caused by poor optimization on the swapping code, or the fact under memory pressure, we're pulling in pages we don't need, causing more swapping. Either way, this is mainline/upstream to Chromium, and ChromeOS developers care a lot about system responsiveness. Lets implement the same change so Zen Kernel users benefit. 
[1]: https://bugs.chromium.org/p/chromium/issues/detail?id=263561 Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/swap.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 248b037a3ce76b..6118738dbd3cca 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -151,6 +151,7 @@ config ZEN_INTERACTIVE Compaction proactiveness.......: 20 -> 0 Watermark boost factor.........: 1.5 -> 0 Pageblock order................: 10 -> 3 + Swap-in readahead..............: 3 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3aa8..ed791c2906aa51 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1101,4 +1105,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } From e7d696e68dbfca244e8575a2e1c1ba7f1d0da399 Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Sat, 4 Nov 2023 13:04:03 -0500 Subject: [PATCH 52/57] ZEN: INTERACTIVE: eevdf: Rebalance more often, but less tasks at once Instead of increasing the number of tasks that migrate at once, migrate the amount acceptable for PREEMPT_RT, but reduce the cost so migrations occur more often. This should make CFS/EEVDF behave more like out-of-tree schedulers that aggressively use idle cores to reduce latency, but without the jank caused by rebalancing too many tasks at once. 
Signed-off-by: Kai Krakow --- init/Kconfig | 3 ++- kernel/sched/fair.c | 4 ++++ kernel/sched/sched.h | 4 +--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 6118738dbd3cca..2a54d857a8ee30 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -157,7 +157,8 @@ config ZEN_INTERACTIVE Minimal granularity............: 0.75 -> 0.4 ms Bandwidth slice size...........: 5 -> 3 ms - Task rebalancing threshold.....: 32 -> 64 + Task rebalancing threshold.....: 32 -> 8 + Migration cost.................: 0.5 -> 0.25 ms --- CPUFreq Settings ----------------------------------- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dcbc243dbc26f7..64a09f4db8b773 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -89,7 +89,11 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; */ unsigned int sysctl_sched_child_runs_first __read_mostly; +#ifdef CONFIG_ZEN_INTERACTIVE +const_debug unsigned int sysctl_sched_migration_cost = 250000UL; +#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dee7857bc5215d..b10c780d114c70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2518,10 +2518,8 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) #define SCHED_NR_MIGRATE_BREAK 8 -#elif defined(CONFIG_ZEN_INTERACTIVE) -#define SCHED_NR_MIGRATE_BREAK 64 #else #define SCHED_NR_MIGRATE_BREAK 32 #endif From 38273f1b4d249bbbcd9d5210e13b2b9ae1c3392d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 May 2023 13:20:46 +0200 Subject: [PATCH 53/57] ZEN: sched/fair: Multi-LLC select_idle_sibling() Tejun reported that when he targets 
workqueues towards a specific LLC on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets significant idle time. This is, of course, because of how select_idle_sibling() will not consider anything outside of the local LLC, and since all these tasks are short running the periodic idle load balancer is ineffective. And while it is good to keep work cache local, it is better to not have significant idle time. Therefore, have select_idle_sibling() try other LLCs inside the same node when the local one comes up empty. Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 1 + 2 files changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64a09f4db8b773..4a4bd8fd0a58fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7380,6 +7380,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return idle_cpu; } +/* + * For the multiple-LLC per node case, make sure to try the other LLC's if the + * local LLC comes up empty. + */ +static int +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *parent = sd->parent; + struct sched_group *sg; + + /* Make sure to not cross nodes. */ + if (!parent || parent->flags & SD_NUMA) + return -1; + + sg = parent->groups; + do { + int cpu = cpumask_first(sched_group_span(sg)); + + if (!cpus_share_cache(cpu, target)) { + int i = select_idle_cpu(p, per_cpu(sd_llc, cpu), + test_idle_cores(cpu), cpu); + if ((unsigned)i < nr_cpumask_bits) + return i; + } + + sg = sg->next; + } while (sg != parent->groups); + + return -1; +} + /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. 
If no CPU is big enough, but there are idle ones, try to @@ -7552,6 +7583,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + if (sched_feat(SIS_NODE)) { + i = select_idle_node(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + } + return target; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae4a..03ed013b23fd7a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -51,6 +51,7 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) +SCHED_FEAT(SIS_NODE, true) /* * Issue a WARN when we do multiple update_rq_clock() calls From 6f013894df548ef82ee31489b92025abd881903c Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Thu, 27 Apr 2023 14:43:57 -0500 Subject: [PATCH 54/57] ZEN: Set default max map count to (INT_MAX - 5) Per [Fedora][1], they intend to change the default max map count for their distribution to improve OOTB compatibility with games played through Steam/Proton. The value they picked comes from the Steam Deck, which defaults to INT_MAX - MAPCOUNT_ELF_CORE_MARGIN. Since most ZEN and Liquorix users probably play games, follow Valve's lead and raise this value to their default. [1]: https://fedoraproject.org/wiki/Changes/IncreaseVmMaxMapCount Signed-off-by: Kai Krakow --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d617d0d696751..ddc17bbd3ef6e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; From e44f07cbee56f73a4abddfdc5eb414a812f24fdc Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 20 Oct 2021 20:50:32 -0700 Subject: [PATCH 55/57] ZEN: mm: Don't hog the CPU and zone lock in rmqueue_bulk() There is noticeable scheduling latency and heavy zone lock contention stemming from rmqueue_bulk's single hold of the zone lock while doing its work, as seen with the preemptoff tracer. There's no actual need for rmqueue_bulk() to hold the zone lock the entire time; it only does so for supposed efficiency. As such, we can relax the zone lock and even reschedule when IRQs are enabled in order to keep the scheduling delays and zone lock contention at bay. Forward progress is still guaranteed, as the zone lock can only be relaxed after page removal. With this change, rmqueue_bulk() no longer appears as a serious offender in the preemptoff tracer, and system latency is noticeably improved. Signed-off-by: Sultan Alsawaf --- mm/page_alloc.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94f6dd38e898b8..817189eff88e23 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2124,16 +2124,17 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } /* - * Obtain a specified number of elements from the buddy allocator, all under - * a single hold of the lock, for efficiency. Add them to the supplied list. - * Returns the number of new pages which were placed at *list. + * Obtain a specified number of elements from the buddy allocator, and relax the + * zone lock when needed. Add them to the supplied list. Returns the number of + * new pages which were placed at *list. 
*/ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + const bool can_resched = !preempt_count() && !irqs_disabled(); unsigned long flags; - int i; + int i, last_mod = 0; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { @@ -2142,6 +2143,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + /* Reschedule and ease the contention on the lock if needed */ + if (i + 1 < count && ((can_resched && need_resched()) || + spin_needbreak(&zone->lock))) { + __mod_zone_page_state(zone, NR_FREE_PAGES, + -((i + 1 - last_mod) << order)); + last_mod = i + 1; + spin_unlock_irqrestore(&zone->lock, flags); + if (can_resched) + cond_resched(); + spin_lock_irqsave(&zone->lock, flags); + } + /* * Split buddy pages returned by expand() are received here in * physical page order. The page is added to the tail of @@ -2158,7 +2171,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, -(1 << order)); } - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; From 0ef35bd766be6bb2d6f5dc6dc6580719f47e1445 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 19 Apr 2020 19:59:18 -0700 Subject: [PATCH 56/57] ZEN: mm: Stop kswapd early when nothing's waiting for it to free pages Contains: - mm: Stop kswapd early when nothing's waiting for it to free pages Keeping kswapd running when all the failed allocations that invoked it are satisfied incurs a high overhead due to unnecessary page eviction and writeback, as well as spurious VM pressure events to various registered shrinkers. When kswapd doesn't need to work to make an allocation succeed anymore, stop it prematurely to save resources. 
Signed-off-by: Sultan Alsawaf - mm: Don't stop kswapd on a per-node basis when there are no waiters The page allocator wakes all kswapds in an allocation context's allowed nodemask in the slow path, so it doesn't make sense to have the kswapd- waiter count per each NUMA node. Instead, it should be a global counter to stop all kswapds when there are no failed allocation requests. Signed-off-by: Sultan Alsawaf - mm: Increment kswapd_waiters for throttled direct reclaimers Throttled direct reclaimers will wake up kswapd and wait for kswapd to satisfy their page allocation request, even when the failed allocation lacks the __GFP_KSWAPD_RECLAIM flag in its gfp mask. As a result, kswapd may think that there are no waiters and thus exit prematurely, causing throttled direct reclaimers lacking __GFP_KSWAPD_RECLAIM to stall on waiting for kswapd to wake them up. Incrementing the kswapd_waiters counter when such direct reclaimers become throttled fixes the problem. Signed-off-by: Sultan Alsawaf Signed-off-by: Kai Krakow --- mm/internal.h | 1 + mm/page_alloc.c | 17 ++++++++++++++--- mm/vmscan.c | 20 ++++++++++++++------ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index abed947f784b7b..3d0c9199cc6ab4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -439,6 +439,7 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 817189eff88e23..a92c712db2a0b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -85,6 +85,8 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); + /* prevent >1 _updater_ of zone percpu 
pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -3933,6 +3935,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; restart: compaction_retries = 0; @@ -3972,8 +3975,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4189,9 +4197,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e9e2cf830daa1d..60d2e009825f16 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6947,7 +6947,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6976,6 +6976,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -7041,7 +7045,7 @@ static bool throttle_direct_reclaim(gfp_t 
gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -7063,11 +7067,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -7567,14 +7574,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (ret || kthread_should_stop() || + !atomic_long_read(&kswapd_waiters)) break; /* From d29edf912f5a2bb2bcb8b0c8e19eb2efb017f5dc Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 27 Dec 2020 14:43:13 +0000 Subject: [PATCH 57/57] ZEN: Input: evdev - use call_rcu when detaching client Significant time was spent on synchronize_rcu in evdev_detach_client when applications closed evdev devices. Switching VT away from a graphical environment commonly leads to mass input device closures, which could lead to noticeable delays on systems with many input devices. 
Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev client struct till after the RCU grace period instead of blocking the calling application. While this does not solve all slow evdev fd closures, it takes care of a good portion of them, including this simple test: #include <fcntl.h> #include <unistd.h> int main(int argc, char *argv[]) { int idx, fd; const char *path = "/dev/input/event0"; for (idx = 0; idx < 1000; idx++) { if ((fd = open(path, O_RDWR)) == -1) { return -1; } close(fd); } return 0; } Time to completion of the above test when run locally: Before: 0m27.111s After: 0m0.018s Signed-off-by: Kenny Levinsen --- drivers/input/evdev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 95f90699d2b17b..2b10fe29d2c8d9 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + struct rcu_head rcu; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; @@ -377,13 +378,22 @@ static void evdev_attach_client(struct evdev *evdev, spin_unlock(&evdev->client_lock); } +static void evdev_reclaim_client(struct rcu_head *rp) +{ + struct evdev_client *client = container_of(rp, struct evdev_client, rcu); + unsigned int i; + for (i = 0; i < EV_CNT; ++i) + bitmap_free(client->evmasks[i]); + kvfree(client); +} + static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); - synchronize_rcu(); + call_rcu(&client->rcu, evdev_reclaim_client); } static int evdev_open_device(struct evdev *evdev) @@ -436,7 +446,6 @@ static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; - unsigned int i; mutex_lock(&evdev->mutex); @@ -448,11 +457,6 @@ static int 
evdev_release(struct inode *inode, struct file *file) evdev_detach_client(evdev, client); - for (i = 0; i < EV_CNT; ++i) - bitmap_free(client->evmasks[i]); - - kvfree(client); - evdev_close_device(evdev); return 0; @@ -495,7 +499,6 @@ static int evdev_open(struct inode *inode, struct file *file) err_free_client: evdev_detach_client(evdev, client); - kvfree(client); return error; }