From 01068a065edccb36568d7fdc0365b2c701c7781c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 23 Jun 2025 13:56:36 +0200 Subject: [PATCH 01/12] btrfs: avoid RCU stall during backref list construction The following kernel message may be logged if `add_inline_refs()` or `add_keyed_refs()` block for too long: > kernel: rcu: INFO: rcu_sched self-detected stall on CPU > kernel: rcu: 10-....: (2100 ticks this GP) idle=0494/1/0x4000000000000000 softirq=164826140/164826187 fqs=1052 > kernel: rcu: (t=2100 jiffies g=358306033 q=2241752 ncpus=16) > kernel: CPU: 10 UID: 0 PID: 1524681 Comm: map_0x178e45670 Not tainted 6.12.21-gentoo #1 > kernel: Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 > kernel: RIP: 0010:btrfs_get_64+0x65/0x110 > kernel: Code: d3 ed 48 8b 4f 70 48 8b 31 83 e6 40 74 11 0f b6 49 40 41 bc 00 10 00 00 49 d3 e4 49 83 ec 01 4a 8b 5c ed 70 49 21 d4 45 89 c9 <48> 2b 1d 7c 99 09 01 49 01 c1 8b 55 08 49 8d 49 08 44 8b 75 0c 48 > kernel: RSP: 0018:ffffbb7ad531bba0 EFLAGS: 00000202 > kernel: RAX: 0000000000001f15 RBX: fffff437ea382200 RCX: fffff437cb891200 > kernel: RDX: 000001922b68df2a RSI: 0000000000000000 RDI: ffffa434c3e66d20 > kernel: RBP: ffffa434c3e66d20 R08: 000001922b68c000 R09: 0000000000000015 > kernel: R10: 6c0000000000000a R11: 0000000009fe7000 R12: 0000000000000f2a > kernel: R13: 0000000000000001 R14: ffffa43192e6d230 R15: ffffa43160c4c800 > kernel: FS: 000055d07085e6c0(0000) GS:ffffa4452bc80000(0000) knlGS:0000000000000000 > kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > kernel: CR2: 00007fff204ecfc0 CR3: 0000000121a0b000 CR4: 00000000001506f0 > kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > kernel: Call Trace: > kernel: > kernel: ? rcu_dump_cpu_stacks+0xd3/0x100 > kernel: ? rcu_sched_clock_irq+0x4ff/0x920 > kernel: ? update_process_times+0x6c/0xa0 > kernel: ? tick_nohz_handler+0x82/0x110 > kernel: ? 
tick_do_update_jiffies64+0xd0/0xd0 > kernel: ? __hrtimer_run_queues+0x10b/0x190 > kernel: ? hrtimer_interrupt+0xf1/0x200 > kernel: ? __sysvec_apic_timer_interrupt+0x44/0x50 > kernel: ? sysvec_apic_timer_interrupt+0x60/0x80 > kernel: > kernel: > kernel: ? asm_sysvec_apic_timer_interrupt+0x16/0x20 > kernel: ? btrfs_get_64+0x65/0x110 > kernel: find_parent_nodes+0x1b84/0x1dc0 > kernel: btrfs_find_all_leafs+0x31/0xd0 > kernel: ? queued_write_lock_slowpath+0x30/0x70 > kernel: iterate_extent_inodes+0x6f/0x370 > kernel: ? update_share_count+0x60/0x60 > kernel: ? extent_from_logical+0x139/0x190 > kernel: ? release_extent_buffer+0x96/0xb0 > kernel: iterate_inodes_from_logical+0xaa/0xd0 > kernel: btrfs_ioctl_logical_to_ino+0xaa/0x150 > kernel: __x64_sys_ioctl+0x84/0xc0 > kernel: do_syscall_64+0x47/0x100 > kernel: entry_SYSCALL_64_after_hwframe+0x4b/0x53 > kernel: RIP: 0033:0x55d07617eaaf > kernel: Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 18 48 8b 44 24 18 64 48 2b 04 25 28 00 00 > kernel: RSP: 002b:000055d07085bc20 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 > kernel: RAX: ffffffffffffffda RBX: 000055d0402f8550 RCX: 000055d07617eaaf > kernel: RDX: 000055d07085bca0 RSI: 00000000c038943b RDI: 0000000000000003 > kernel: RBP: 000055d07085bea0 R08: 00007fee46c84080 R09: 0000000000000000 > kernel: R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 > kernel: R13: 000055d07085bf80 R14: 000055d07085bf48 R15: 000055d07085c0b0 > kernel: The RCU stall could be because there's a large number of backrefs for some extents and we're spending too much time looping over them without ever yielding the cpu. Avoid the stall warning by adding `cond_resched()`.
Link: https://lore.kernel.org/linux-btrfs/CAMthOuP_AE9OwiTQCrh7CK73xdTZvHsLTB1JU2WBK6cCc05JYg@mail.gmail.com/T/#md2e3504a1885c63531f8eefc70c94cff571b7a72 Signed-off-by: Kai Krakow --- fs/btrfs/backref.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2ab550a1e715a7..8895a2b446eb5d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1126,6 +1126,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, if (ret) return ret; ptr += btrfs_extent_inline_ref_size(type); + cond_resched(); } return 0; @@ -1229,7 +1230,7 @@ static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, } if (ret) return ret; - + cond_resched(); } return ret; From 8ff33f96d6a938a1a3d7abe4567e1604e745e590 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 17:01:09 +0100 Subject: [PATCH 02/12] btrfs: add new Kconfig option for btrfs allocator hints Signed-off-by: Kai Krakow --- fs/btrfs/Kconfig | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4438637c8900cd..940a24b3402598 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -84,6 +84,51 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_ALLOCATOR_HINTS + bool "Btrfs allocator hints" + depends on BTRFS_FS + default n + help + Enable support for allocator hints. This feature allows to select + dedicated or preferred devices for meta data vs data, or prevent + allocation from a device at all. This feature does not interact + well with free space calculation because the formula expects to + allocate space always from a device with most free space which is + not true when hints are applied. It may also create issues if a + device from the pool dies resulting in a situation where there are + still enough RAID mirror members but the allocation hints don't + allow to allocate from specific devices. 
+ + You are advised to watch your free space closely with btrfs tools + instead of relying on df only. + + Mounting a btrfs with this feature on or off is always possible, + there are no incompatible changes to the file system. But running + without this feature may place new chunks on unwanted devices and + you may want to clean up later by balancing the affected chunks. + + Supported hint types in /sys/fs/btrfs/BTRFS-UUID/devinfo/ID/type: + + - type = 0 - allocate data chunks from this ID first (recommended + for big disks with good sequential performance, e.g. + HDDs), prefers data on this device + - type = 1 - allocate meta data chunks from this ID first + (recommended for fast and small disks with good + latency, e.g. SSD/NVMe), prefers meta data on this + device + - type = 2 - allocate only meta data chunks from this ID, no data + chunks will ever be allocated from this device + - type = 3 - allocate only data chunks from this ID, no meta data + chunks will ever be allocated from this device + - type = 4 - allocate any chunks from this device last, will never + allocate any space from this device unless there isn't + enough space on other devices + - type = 5 - never allocate any new chunks, useful when putting a + device out of use and to avoid redundant chunk writes + during balance/replace + + If unsure, say N. + config BTRFS_EXPERIMENTAL bool "Btrfs experimental features" depends on BTRFS_FS From 9b28489221e3613f2d188c5378d9bfe6b8551880 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 17:05:53 +0100 Subject: [PATCH 03/12] btrfs: add flags to give a hint to the chunk allocator Add the following flags to give a hint about which chunk should be allocated on which disk.
The following flags are created: - BTRFS_DEV_ALLOCATION_PREFERRED_DATA preferred data chunk, but metadata chunk allowed - BTRFS_DEV_ALLOCATION_PREFERRED_METADATA preferred metadata chunk, but data chunk allowed - BTRFS_DEV_ALLOCATION_METADATA_ONLY only metadata chunk allowed - BTRFS_DEV_ALLOCATION_DATA_ONLY only data chunk allowed Co-authored-by: Goffredo Baroncelli Signed-off-by: Kai Krakow --- include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d84..bed65f91c67821 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -578,6 +578,23 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS +/* dev_item.type */ + +/* btrfs chunk allocation hints */ +#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3 +/* preferred data chunk, but metadata chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL) +/* preferred metadata chunk, but data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL) +/* only metadata chunk allowed */ +#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) +/* only data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) +/* 5..7 are unused values */ + +#endif + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; From 34ed0e489aeaeab1a6b6f7df0e8589b18e1a3f13 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 17:12:14 +0100 Subject: [PATCH 04/12] btrfs: export dev_item.type in /sys/fs/btrfs//devinfo//type Co-authored-by: Goffredo Baroncelli Signed-off-by: Kai Krakow --- fs/btrfs/sysfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81f52c1f55ce57..fbce70fa37bf82 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2140,6 +2140,18 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject 
*kobj, } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS +static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); +} +BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); +#endif + /* * Information about one device. * @@ -2153,6 +2165,9 @@ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + BTRFS_ATTR_PTR(devid, type), +#endif NULL }; ATTRIBUTE_GROUPS(devid); From 42e48fd39c87d540a7b72e2f3133c135e1e97ba0 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 21:18:00 +0100 Subject: [PATCH 05/12] btrfs: change the DEV_ITEM 'type' field via sysfs v2: Adds a check to prevent modification while the file system is still mounting. 
Todo: - Transactions should not be triggered from sysfs writes, see: https://lore.kernel.org/linux-btrfs/20251213200920.1808679-1-kai@kaishome.de/ Link: https://github.com/kakra/linux/pull/36#issuecomment-3406301805 Reported-by: Eli Venter Co-authored-by: Goffredo Baroncelli Signed-off-by: Kai Krakow --- fs/btrfs/sysfs.c | 64 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 ++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fbce70fa37bf82..dfdfeda7c50393 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2149,7 +2149,69 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); } -BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); + +static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_device *device; + int ret; + struct btrfs_trans_handle *trans; + + u64 type, prev_type; + + device = container_of(kobj, struct btrfs_device, devid_kobj); + fs_info = device->fs_info; + if (!fs_info) + return -EPERM; + + /* + * Changing the type field requires starting a transaction which will cause a NULL dereference in + * __reserve_bytes if the file system is not fully open. Thus, return EBUSY if the file system is not fully + * initialized.
+ */ + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return -EBUSY; + + root = fs_info->chunk_root; + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtou64(buf, 0, &type); + if (ret < 0) + return -EINVAL; + + /* for now, only allow touching the 'allocation hint' bits */ + if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)) + return -EINVAL; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + prev_type = device->type; + device->type = type; + + ret = btrfs_update_device(trans, device); + + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto abort; + } + + ret = btrfs_commit_transaction(trans); + if (ret < 0) + goto abort; + + return len; +abort: + device->type = prev_type; + return ret; +} +BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); #endif /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba300..8843578e1b4e04 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2942,7 +2942,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; } -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, +noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2cbf8080eade06..639aeaecc11644 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -890,6 +890,8 @@ int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); From 
3212d021a19de63875b3483e4d27d515e073263c Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 17:24:36 +0100 Subject: [PATCH 06/12] btrfs: add allocator_hint mode When this mode is enabled, the chunk allocation policy is modified as follows: Each disk may have a different tag: - BTRFS_DEV_ALLOCATION_PREFERRED_METADATA - BTRFS_DEV_ALLOCATION_METADATA_ONLY - BTRFS_DEV_ALLOCATION_DATA_ONLY - BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default) Where: - ALLOCATION_PREFERRED_X means that it is preferred to use this disk for the X chunk type (the other type may be allowed when the space is low) - ALLOCATION_X_ONLY means that it is used *only* for the X chunk type. This means also that it is a preferred choice. Each time the allocator allocates a chunk of type X, first it takes the disks tagged as ALLOCATION_X_ONLY or ALLOCATION_PREFERRED_X. If the space is not enough, it uses also the disks tagged as ALLOCATION_METADATA_ONLY. If the space is not enough, it uses also the other disks, with the exception of the one marked as ALLOCATION_PREFERRED_Y, where Y is the other type of chunk (i.e. not X). 
Co-authored-by: Goffredo Baroncelli Signed-off-by: Kai Krakow --- fs/btrfs/volumes.c | 105 ++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 3 ++ 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8843578e1b4e04..ee94e02013849f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -184,6 +184,21 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags return BTRFS_BG_FLAG_TO_INDEX(profile); } +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS +#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1) +#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) + +static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, + [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, + /* the other values are set to 0 */ +}; +#endif + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -5089,13 +5104,20 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, } /* - * sort the devices in descending order by max_avail, total_avail + * sort the devices in descending order by alloc_hint (optional), + * max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + if (di_a->alloc_hint > di_b->alloc_hint) + return -1; + if (di_a->alloc_hint < di_b->alloc_hint) + return 1; +#endif if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) @@ -5303,16 +5325,97 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = 
device; + +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { + /* + * if mixed bg set all the alloc_hint + * fields to the same value, so the sorting + * is not affected + */ + devices_info[ndevs].alloc_hint = 0; + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { + int hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_METADATA_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_METADATA_ONLY == hint) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk + * higher priority) + */ + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + } else { /* BTRFS_BLOCK_GROUP_METADATA */ + int hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_DATA_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_DATA_ONLY == hint) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (metadata hint + * higher priority) + */ + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } +#endif + ++ndevs; } ctl->ndevs = ndevs; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + /* + * no devices available + */ + if (!ndevs) + return 0; +#endif + /* * now sort the devices by hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + /* + * select the minimum set of disks grouped by hint that + * can host the chunk + */ + ndevs = 0; + while (ndevs < ctl->ndevs) { + int hint = devices_info[ndevs++].alloc_hint; + while (ndevs < ctl->ndevs && + devices_info[ndevs].alloc_hint == hint) + ndevs++; + if (ndevs >= ctl->devs_min) + break; + } + + BUG_ON(ndevs > ctl->ndevs); + ctl->ndevs = ndevs; + + /* + * the next layers require the devices_info ordered by + * max_avail. If we are returning two (or more) different + * groups of alloc_hint, this is not always true. So sort + * these again.
+ */ + + for (int i = 0 ; i < ndevs ; i++) + devices_info[i].alloc_hint = 0; + + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); +#endif + return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 639aeaecc11644..d7a9a5bc20717b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -599,6 +599,9 @@ struct btrfs_device_info { u64 dev_offset; u64 max_avail; u64 total_avail; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + int alloc_hint; +#endif }; struct btrfs_raid_attr { From d9b8b0c82aefcd3855ef8aa7fec8f544fac99aca Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Thu, 27 Jun 2024 20:05:58 +0200 Subject: [PATCH 07/12] btrfs: add allocator_hint for no allocation preferred This is useful where you want to prevent new allocations of chunks on a disk which is going to be removed from the pool anyways, e.g. due to bad blocks or because it's slow. Signed-off-by: Kai Krakow --- fs/btrfs/volumes.c | 6 +++++- include/uapi/linux/btrfs_tree.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ee94e02013849f..394957d5e850fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -195,6 +195,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, + [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99, /* the other values are set to 0 */ }; #endif @@ -5362,7 +5363,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, * sort also by hint (metadata hint * higher priority) */ - devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + if (BTRFS_DEV_ALLOCATION_PREFERRED_NONE == hint) + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + else + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; } #endif diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 
bed65f91c67821..8ed556bd2d5917 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -591,6 +591,8 @@ struct btrfs_node { #define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) /* only data chunk allowed */ #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) +/* avoid chunk allocation if possible */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) /* 5..7 are unused values */ #endif From f45c4e838ce4f7f98563b797f3f09fd0068641ea Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Fri, 6 Dec 2024 00:55:31 +0100 Subject: [PATCH 08/12] btrfs: add allocator_hint to disable allocation completely This is useful where you want to prevent new allocations of chunks to a set of multiple disks which are going to be removed from the pool. This acts as a multiple `btrfs dev remove` on steroids that can remove multiple disks in parallel without moving data to disks which would be removed in the next round. In such cases, it will avoid moving the same data multiple times, and thus avoid placing it on potentially bad disks. Thanks to @Zygo for the explanation and suggestion. 
Link: https://github.com/kdave/btrfs-progs/issues/907#issuecomment-2520897104 Signed-off-by: Kai Krakow --- fs/btrfs/volumes.c | 11 +++++++++++ include/uapi/linux/btrfs_tree.h | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 394957d5e850fa..815292712a243e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -191,6 +191,7 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99, [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, @@ -5344,6 +5345,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, */ if (BTRFS_DEV_ALLOCATION_METADATA_ONLY == hint) continue; + /* + * skip BTRFS_DEV_NONE_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_NONE_ONLY == hint) + continue; /* * if a data chunk must be allocated, * sort also by hint (data disk @@ -5358,6 +5364,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, */ if (BTRFS_DEV_ALLOCATION_DATA_ONLY == hint) continue; + /* + * skip BTRFS_DEV_NONE_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_NONE_ONLY == hint) + continue; /* * if a data chunk must be allocated, * sort also by hint (metadata hint diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 8ed556bd2d5917..2c8f407be8a442 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -593,7 +593,9 @@ struct btrfs_node { #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) /* avoid chunk allocation if possible */ #define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) -/* 5..7 are unused values */ +/* deny chunk allocation */ +#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL) +/* 6..7 are unused values */ #endif From 8305177a0ea90dff5a2263aa11de061054e00ae2 Mon Sep 17 00:00:00 
2001 From: Kai Krakow Date: Sat, 13 Dec 2025 17:50:16 +0100 Subject: [PATCH 09/12] btrfs: add io read stats per device to devinfo This adds read stats per device to devinfo to evaluate the effects of different read policies better. This adds a new file /sys/fs/btrfs/BTRFS-UUID/devinfo/ID/read_stats. Signed-off-by: Kai Krakow --- fs/btrfs/Kconfig | 12 ++++++++++++ fs/btrfs/sysfs.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 30 +++++++++++++++++++++++++++--- fs/btrfs/volumes.h | 8 ++++++++ 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 940a24b3402598..888561b2a9ed79 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -129,6 +129,18 @@ config BTRFS_ALLOCATOR_HINTS If unsure, say N. +config BTRFS_PER_DEVICE_IO_STATS + bool "Btrfs per io devices stats" + depends on BTRFS_FS + default n + help + Enable collecting io read stats per devices to evaluate the effects + of different read policies better. + + This adds a new file /sys/fs/btrfs/BTRFS-UUID/devinfo/ID/read_stats. + + If unsure, say N. 
+ config BTRFS_EXPERIMENTAL bool "Btrfs experimental features" depends on BTRFS_FS diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index dfdfeda7c50393..d8eac006904b6b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -26,6 +26,10 @@ #include "fs.h" #include "accessors.h" +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS +#include +#endif + /* * Structure name Path * -------------------------------------------------------------------------- @@ -2214,12 +2218,36 @@ static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); #endif +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS +static ssize_t btrfs_devinfo_read_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + u64 read_wait = part_stat_read(device->bdev, nsecs[READ]); + unsigned long read_ios = part_stat_read(device->bdev, ios[READ]); + + u64 avg_wait = 0; + if (read_wait && read_ios && read_wait >= read_ios) + avg_wait = div_u64(read_wait, read_ios); + + return scnprintf(buf, PAGE_SIZE, "ios %lu wait %llu avg %llu age %llu ignored %llu\n", + read_ios, read_wait, avg_wait, + (u64)atomic64_read(&device->last_io_age), + (u64)atomic64_read(&device->stripe_ignored)); +} +BTRFS_ATTR(devid, read_stats, btrfs_devinfo_read_stats_show); +#endif + /* * Information about one device. 
* * Path: /sys/fs/btrfs//devinfo// */ static struct attribute *devid_attrs[] = { +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + BTRFS_ATTR_PTR(devid, read_stats), +#endif BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 815292712a243e..70cebae42de356 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6139,6 +6139,15 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + /* age each possible stripe by 1 IO */ + for (int i = first; i < first + num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + atomic64_inc(&device->last_io_age); + atomic64_inc(&device->stripe_ignored); + } +#endif + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ @@ -6174,14 +6183,29 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, for (tolerance = 0; tolerance < 2; tolerance++) { if (map->stripes[preferred_mirror].dev->bdev && (tolerance || map->stripes[preferred_mirror].dev != srcdev)) - return preferred_mirror; + goto out; for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && - (tolerance || map->stripes[i].dev != srcdev)) - return i; + (tolerance || map->stripes[i].dev != srcdev)) { + preferred_mirror = i; + goto out; + } } } +out: +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + do { + struct btrfs_device *preferred_device = map->stripes[preferred_mirror].dev; + + /* reset age of selected stripe */ + atomic64_set(&preferred_device->last_io_age, 0); + + /* do not count ignores for the selected stripe */ + atomic64_dec(&preferred_device->stripe_ignored); + } while (0); +#endif + /* we couldn't find one that doesn't fail. 
Just return something * and the io error handling code will clean up eventually */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d7a9a5bc20717b..c7623e2e3691b3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -201,6 +201,14 @@ struct btrfs_device { /* Bandwidth limit for scrub, in bytes */ u64 scrub_speed_max; + +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + /* store an age of last read access */ + atomic64_t last_io_age; + + /* store how often a stripe has been ignored as a read candidate */ + atomic64_t stripe_ignored; +#endif }; /* From 4a6d0120d8031af45ca375a9102c34573a757ca4 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 18:34:36 +0100 Subject: [PATCH 10/12] btrfs: move read policies out of experimental Read policies seem safe and stable enough to move it out of the experimental feature set. This allows us to add more policies without forcing users to enable the full experimental feature set. Signed-off-by: Kai Krakow --- fs/btrfs/Kconfig | 15 +++++++++++++++ fs/btrfs/super.c | 4 ++-- fs/btrfs/sysfs.c | 12 ++++++------ fs/btrfs/sysfs.h | 2 +- fs/btrfs/volumes.c | 6 +++--- fs/btrfs/volumes.h | 6 ++++-- 6 files changed, 31 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 888561b2a9ed79..e30e5f153d7bfd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -141,6 +141,21 @@ config BTRFS_PER_DEVICE_IO_STATS If unsure, say N. +config BTRFS_READ_POLICIES + bool "Btrfs read policies" + depends on BTRFS_FS + default n + help + This enables btrfs read policies to control how btrfs selects stripes + from a mirror during read operations. This was originally part of + the experimental feature set but it is safe to use and can provide + huge performance benefits in certain scenarios without causing any + performance regressions. + + This adds a new file /sys/fs/btrfs/BTRFS-UUID/read_policy. + + If unsure, say N. 
+ config BTRFS_EXPERIMENTAL bool "Btrfs experimental features" depends on BTRFS_FS diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 430e7419349c9a..2851e6eb61eb1d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2498,7 +2498,7 @@ static int __init btrfs_print_mod_info(void) #endif ; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES if (btrfs_get_mod_read_policy() == NULL) pr_info("Btrfs loaded%s\n", options); else @@ -2565,7 +2565,7 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = btrfs_extent_map_init, .exit_func = btrfs_extent_map_exit, -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES }, { .init_func = btrfs_read_policy_init, .exit_func = NULL, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d8eac006904b6b..f9fe6a27509bc8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1323,13 +1323,13 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char *btrfs_read_policy_name[] = { "pid", -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES "round-robin", "devid", #endif }; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Global module configuration parameters. */ static char *read_policy; @@ -1354,7 +1354,7 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) strscpy(param, str); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Separate value from input in policy:value format. 
*/ value_str = strchr(param, ':'); if (value_str) { @@ -1376,7 +1376,7 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) return sysfs_match_string(btrfs_read_policy_name, param); } -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES int __init btrfs_read_policy_init(void) { s64 value; @@ -1407,7 +1407,7 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); @@ -1437,7 +1437,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* If moving from RR then disable collecting fs stats. */ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = false; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 0f94ae9232101b..ef1bd5024be4d4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -50,7 +50,7 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); int btrfs_read_policy_to_enum(const char *str, s64 *value); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES int __init btrfs_read_policy_init(void); char *btrfs_get_mod_read_policy(void); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 70cebae42de356..667c96ebfd0141 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1282,7 +1282,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid 
= latest_dev->devid; fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), @@ -6050,7 +6050,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { for (int index = first; index < first + num_stripes; index++) { @@ -6158,7 +6158,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7623e2e3691b3..af1ccc2a3006e2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -318,7 +318,7 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, /* Read from a specific device. */ @@ -463,7 +463,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* * Minimum contiguous reads before switching to next device, the unit * is one block/sectorsize. @@ -472,7 +472,9 @@ struct btrfs_fs_devices { /* Device to be used for reading in case of RAID1. */ u64 read_devid; +#endif +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. 
*/ enum btrfs_offload_csum_mode offload_csum_mode; #endif From 6137992d4083ed4c5b50914a5bb1a8a9058b4169 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Fri, 25 Apr 2025 21:46:52 +0200 Subject: [PATCH 11/12] btrfs: add in-flight queue read policy Select the preferred stripe based on the mirror with the fewest in-flight requests. Signed-off-by: Kai Krakow --- fs/btrfs/sysfs.c | 3 ++- fs/btrfs/volumes.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 2 ++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f9fe6a27509bc8..9263125a064cd3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1325,6 +1325,7 @@ static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_READ_POLICIES "round-robin", + "queue", "devid", #endif }; @@ -1341,7 +1342,7 @@ char *btrfs_get_mod_read_policy(void) /* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ module_param(read_policy, charp, 0); MODULE_PARM_DESC(read_policy, -"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +"Global read policy: pid (default), round-robin[:<min_contig_read>], queue, devid[:<devid>]"); #endif int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 667c96ebfd0141..69e24f5b7d4705 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,10 @@ #include "super.h" #include "raid-stripe-tree.h" +#ifdef CONFIG_BTRFS_READ_POLICIES +#include <linux/part_stat.h> +#endif + #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -6051,6 +6055,44 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_READ_POLICIES +static unsigned int part_in_flight(struct block_device *part) +{ + unsigned int inflight = 0; + int cpu; + + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[READ], cpu) + 
part_stat_local_read_cpu(part, in_flight[WRITE], cpu); + } + if ((int)inflight < 0) + inflight = 0; + + return inflight; +} + +/* + * btrfs_read_earliest + * + * Select a stripe from the device with the fewest in-flight requests. + */ +static int btrfs_read_earliest(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, int first, + int num_stripes) +{ + u64 best_in_flight = U64_MAX; + int best_stripe = 0; + + for (int index = first; index < first + num_stripes; index++) { + u64 in_flight = part_in_flight(map->stripes[index].dev->bdev); + if (best_in_flight > in_flight) { + best_in_flight = in_flight; + best_stripe = index; + } + } + + return best_stripe; +} + static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { for (int index = first; index < first + num_stripes; index++) { @@ -6162,6 +6204,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; + case BTRFS_READ_POLICY_QUEUE: + preferred_mirror = btrfs_read_earliest(fs_info, map, first, + num_stripes); + break; case BTRFS_READ_POLICY_DEVID: preferred_mirror = btrfs_read_preferred(map, first, num_stripes); break; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index af1ccc2a3006e2..1d02a79e7b91a6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -321,6 +321,8 @@ enum btrfs_read_policy { #ifdef CONFIG_BTRFS_READ_POLICIES /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, + /* Read from the device with the fewest in-flight requests */ + BTRFS_READ_POLICY_QUEUE, /* Read from a specific device. 
*/ BTRFS_READ_POLICY_DEVID, #endif From 6d23e96eeae7587d380a8968e9f438bb76cef7a4 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Wed, 17 Dec 2025 14:01:08 +0100 Subject: [PATCH 12/12] GITHUB: btrfs: add build workflow --- .github/workflows/makefile.yml | 34 ++++++++++++++++++++++++++++++++++ .gitignore | 3 +++ 2 files changed, 37 insertions(+) create mode 100644 .github/workflows/makefile.yml diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml new file mode 100644 index 00000000000000..e1a5d4bd88c467 --- /dev/null +++ b/.github/workflows/makefile.yml @@ -0,0 +1,34 @@ +name: Makefile CI + +on: + push: + branches: + - 'rebase-*/btrfs-patches' + pull_request: + branches: + - 'rebase-*/btrfs-patches' + workflow_dispatch: + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Configure minimal kernel + run: make tinyconfig + + - name: Configure btrfs + run: | + echo "CONFIG_BLOCK=y" >>.config + echo "CONFIG_BTRFS_FS=y" >>.config + echo "CONFIG_BTRFS_FS_POSIX_ACL=y" >>.config + echo "CONFIG_BTRFS_ALLOCATOR_HINTS=y" >>.config + echo "CONFIG_BTRFS_PER_DEVICE_IO_STATS=y" >>.config + echo "CONFIG_BTRFS_READ_POLICIES=y" >>.config + make oldconfig + + - name: Compile kernel + run: make -j$(nproc) all diff --git a/.gitignore b/.gitignore index 86a1ba0d903539..d09ac6e2f574a3 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,6 @@ sphinx_*/ # Rust analyzer configuration /rust-project.json + +# Allow Github workflows +!/.github