diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml new file mode 100644 index 00000000000000..e1a5d4bd88c467 --- /dev/null +++ b/.github/workflows/makefile.yml @@ -0,0 +1,34 @@ +name: Makefile CI + +on: + push: + branches: + - 'rebase-*/btrfs-patches' + pull_request: + branches: + - 'rebase-*/btrfs-patches' + workflow_dispatch: + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Configure minimal kernel + run: make tinyconfig + + - name: Configure btrfs + run: | + echo "CONFIG_BLOCK=y" >>.config + echo "CONFIG_BTRFS_FS=y" >>.config + echo "CONFIG_BTRFS_FS_POSIX_ACL=y" >>.config + echo "CONFIG_BTRFS_ALLOCATOR_HINTS=y" >>.config + echo "CONFIG_BTRFS_PER_DEVICE_IO_STATS=y" >>.config + echo "CONFIG_BTRFS_READ_POLICIES=y" >>.config + make oldconfig + + - name: Compile kernel + run: make -j$(nproc) all diff --git a/.gitignore b/.gitignore index 86a1ba0d903539..d09ac6e2f574a3 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,6 @@ sphinx_*/ # Rust analyzer configuration /rust-project.json + +# Allow Github workflows +!/.github diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4438637c8900cd..e30e5f153d7bfd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -84,6 +84,78 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_ALLOCATOR_HINTS + bool "Btrfs allocator hints" + depends on BTRFS_FS + default n + help + Enable support for allocator hints. This feature allows to select + dedicated or preferred devices for meta data vs data, or prevent + allocation from a device at all. This feature does not interact + well with free space calculation because the formula expects to + allocate space always from a device with most free space which is + not true when hints are applied. It may also create issues if a + device from the pool dies resulting in a situation where there are + still enough RAID mirror members but the allocation hints don't + allow to allocate from specific devices. 
+ + You are advised to watch your free space closely with btrfs tools + instead of relying on df only. + + Mounting a btrfs with this feature on or off is always possible, + there are no incompatible changes to the file system. But running + without this feature may place new chunks on unwanted devices and + you may want to clean up later by balancing the affected chunks. + + Supported hint types in /sys/fs/btrfs/BTRFS-UUID/devinfo/ID/type: + + - type = 0 - allocate data chunks from this ID first (recommended + for big disks with good sequential performance, e.g. + HDDs), prefers data on this device + - type = 1 - allocate meta data chunks from this ID first + (recommended for fast and small disks with good + latency, e.g. SSD/NVMe), prefers meta data on this + device + - type = 2 - allocate only meta data chunks from this ID, no data + chunks will ever be allocated from this device + - type = 3 - allocate only data chunks from this ID, no meta data + chunks will ever be allocated from this device + - type = 4 - allocate any chunks from this device last, will never + allocate any space from this device unless there isn't + enough space on other devices + - type = 5 - never allocate any new chunks, useful when putting a + device out of use and to avoid redundant chunk writes + during balance/replace + + If unsure, say N. + +config BTRFS_PER_DEVICE_IO_STATS + bool "Btrfs per io devices stats" + depends on BTRFS_FS + default n + help + Enable collecting io read stats per devices to evaluate the effects + of different read policies better. + + This adds a new file /sys/fs/btrfs/BTRFS-UUID/devinfo/ID/read_stats. + + If unsure, say N. + +config BTRFS_READ_POLICIES + bool "Btrfs read policies" + depends on BTRFS_FS + default n + help + This enables btrfs read policies to control how btrfs selects stripes + from a mirror during read operations. 
This was originally part of + the experimental feature set but it is safe to use and can provide + huge performance benefits in certain scenarios without causing any + performance regressions. + + This adds a new file /sys/fs/btrfs/BTRFS-UUID/read_policy. + + If unsure, say N. + config BTRFS_EXPERIMENTAL bool "Btrfs experimental features" depends on BTRFS_FS diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2ab550a1e715a7..8895a2b446eb5d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1126,6 +1126,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, if (ret) return ret; ptr += btrfs_extent_inline_ref_size(type); + cond_resched(); } return 0; @@ -1229,7 +1230,7 @@ static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, } if (ret) return ret; - + cond_resched(); } return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 430e7419349c9a..2851e6eb61eb1d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2498,7 +2498,7 @@ static int __init btrfs_print_mod_info(void) #endif ; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES if (btrfs_get_mod_read_policy() == NULL) pr_info("Btrfs loaded%s\n", options); else @@ -2565,7 +2565,7 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = btrfs_extent_map_init, .exit_func = btrfs_extent_map_exit, -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES }, { .init_func = btrfs_read_policy_init, .exit_func = NULL, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81f52c1f55ce57..9263125a064cd3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -26,6 +26,10 @@ #include "fs.h" #include "accessors.h" +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS +#include <linux/part_stat.h> +#endif + /* * Structure name Path * -------------------------------------------------------------------------- @@ -1319,13 +1323,14 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char *btrfs_read_policy_name[] = { "pid", -#ifdef CONFIG_BTRFS_EXPERIMENTAL
+#ifdef CONFIG_BTRFS_READ_POLICIES "round-robin", + "queue", "devid", #endif }; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Global module configuration parameters. */ static char *read_policy; @@ -1337,7 +1342,7 @@ char *btrfs_get_mod_read_policy(void) /* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ module_param(read_policy, charp, 0); MODULE_PARM_DESC(read_policy, -"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +"Global read policy: pid (default), round-robin[:<min_contig_read>], queue, devid[:<devid>]"); #endif int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) @@ -1350,7 +1355,7 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) strscpy(param, str); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { @@ -1372,7 +1377,7 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) return sysfs_match_string(btrfs_read_policy_name, param); } -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES int __init btrfs_read_policy_init(void) { s64 value; @@ -1403,7 +1408,7 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); @@ -1433,7 +1438,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* If moving from RR then disable collecting fs stats.
*/ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = false; @@ -2140,12 +2145,110 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS +static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); +} + +static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_device *device; + int ret; + struct btrfs_trans_handle *trans; + + u64 type, prev_type; + + device = container_of(kobj, struct btrfs_device, devid_kobj); + fs_info = device->fs_info; + if (!fs_info) + return -EPERM; + + /* + * Changing the type field requires starting a transaction which will cause a NULL dereference in + * __reserve_bytes if the file system is not fully open. Thus, return EBUSY if the file system is not fully + * initialized.
+ */ + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return -EBUSY; + + root = fs_info->chunk_root; + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtou64(buf, 0, &type); + if (ret < 0) + return -EINVAL; + + /* for now, only allow touching the 'allocation hint' bits */ + if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)) + return -EINVAL; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + prev_type = device->type; + device->type = type; + + ret = btrfs_update_device(trans, device); + + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto abort; + } + + ret = btrfs_commit_transaction(trans); + if (ret < 0) + goto abort; + + return len; +abort: + device->type = prev_type; + return ret; +} +BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); +#endif + +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS +static ssize_t btrfs_devinfo_read_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + struct block_device *bdev = device->bdev; /* may be NULL; then report zero counters */ + u64 read_wait = bdev ? part_stat_read(bdev, nsecs[READ]) : 0; + unsigned long read_ios = bdev ? part_stat_read(bdev, ios[READ]) : 0; + u64 avg_wait = 0; + if (read_wait && read_ios && read_wait >= read_ios) + avg_wait = div64_ul(read_wait, read_ios); /* divisor is unsigned long; div_u64() takes a u32 */ + + return scnprintf(buf, PAGE_SIZE, "ios %lu wait %llu avg %llu age %llu ignored %llu\n", + read_ios, read_wait, avg_wait, + (u64)atomic64_read(&device->last_io_age), + (u64)atomic64_read(&device->stripe_ignored)); +} +BTRFS_ATTR(devid, read_stats, btrfs_devinfo_read_stats_show); +#endif + /* * Information about one device.
* * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ */ static struct attribute *devid_attrs[] = { +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + BTRFS_ATTR_PTR(devid, read_stats), +#endif BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), @@ -2153,6 +2256,9 @@ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + BTRFS_ATTR_PTR(devid, type), +#endif NULL }; ATTRIBUTE_GROUPS(devid); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 0f94ae9232101b..ef1bd5024be4d4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -50,7 +50,7 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); int btrfs_read_policy_to_enum(const char *str, s64 *value); -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES int __init btrfs_read_policy_init(void); char *btrfs_get_mod_read_policy(void); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba300..69e24f5b7d4705 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,10 @@ #include "super.h" #include "raid-stripe-tree.h" +#ifdef CONFIG_BTRFS_READ_POLICIES +#include <linux/part_stat.h> +#endif + #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -184,6 +188,23 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags return BTRFS_BG_FLAG_TO_INDEX(profile); } +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS +#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1) +#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) + +static const s8 alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { /* s8, not char: holds negative priorities and plain char may be unsigned */ + [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99, + [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, + [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, + [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99, + /* the other values are set to 0 */ +}; +#endif + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -1265,7 +1286,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), @@ -2942,7 +2963,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; } -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, +noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; @@ -5089,13 +5110,20 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, } /* - * sort the devices in descending order by max_avail, total_avail + * sort the devices in descending order by alloc_hint (optional), + * max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + if (di_a->alloc_hint > di_b->alloc_hint) + return -1; + if (di_a->alloc_hint < di_b->alloc_hint) + return 1; +#endif if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) @@ -5303,16 +5331,110 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; + +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + if ((ctl->type & 
BTRFS_BLOCK_GROUP_DATA) && + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { + /* + * if mixed bg set all the alloc_hint + * fields to the same value, so the sorting + * is not affected + */ + devices_info[ndevs].alloc_hint = 0; + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { + int hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_ALLOCATION_METADATA_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_METADATA_ONLY == hint) + continue; + /* + * skip BTRFS_DEV_ALLOCATION_NONE_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_NONE_ONLY == hint) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk + * higher priority) + */ + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + } else { /* BTRFS_BLOCK_GROUP_METADATA */ + int hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_ALLOCATION_DATA_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_DATA_ONLY == hint) + continue; + /* + * skip BTRFS_DEV_ALLOCATION_NONE_ONLY disks + */ + if (BTRFS_DEV_ALLOCATION_NONE_ONLY == hint) + continue; + /* + * if a metadata chunk must be allocated, + * sort also by hint (metadata hint + * higher priority) + */ + if (BTRFS_DEV_ALLOCATION_PREFERRED_NONE == hint) + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + else + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } +#endif + ++ndevs; } ctl->ndevs = ndevs; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + /* + * no devices available + */ + if (!ndevs) + return 0; +#endif + /* * now sort the devices by hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + /* + * select the minimum set of disks grouped by hint that + * can host the chunk + */ + ndevs = 0; + while (ndevs < ctl->ndevs) { + int hint = devices_info[ndevs++].alloc_hint; + while (ndevs < ctl->ndevs && + devices_info[ndevs].alloc_hint == hint) + ndevs++; + if (ndevs >= ctl->devs_min) + break; + } + + BUG_ON(ndevs > ctl->ndevs); + ctl->ndevs = ndevs; +
+ /* + * the next layers require the devices_info ordered by + * max_avail. If we are returning two (or more) different + * groups of alloc_hint, this is not always true. So sort + * them again. + */ + + for (int i = 0 ; i < ndevs ; i++) + devices_info[i].alloc_hint = 0; + + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); +#endif + return 0; } @@ -5932,7 +6054,45 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES +static unsigned int part_in_flight(struct block_device *part) +{ + unsigned int inflight = 0; + int cpu; + + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[READ], cpu) + + part_stat_local_read_cpu(part, in_flight[WRITE], cpu); + } + if ((int)inflight < 0) + inflight = 0; + + return inflight; +} + +/* + * Select the stripe whose device has the fewest in-flight requests. + * Stripes whose device has no block device (NULL bdev) rank last and are never dereferenced. + */ +static int btrfs_read_earliest(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, int first, + int num_stripes) +{ + u64 best_in_flight = U64_MAX; + int best_stripe = first; + + for (int index = first; index < first + num_stripes; index++) { + struct block_device *bdev = map->stripes[index].dev->bdev; + u64 in_flight = bdev ? part_in_flight(bdev) : U64_MAX; + if (best_in_flight > in_flight) { + best_in_flight = in_flight; + best_stripe = index; + } + } + + return best_stripe; +} + static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { for (int index = first; index < first + num_stripes; index++) { @@ -6021,6 +6181,15 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + /* age each possible stripe by 1 IO */ + for (int i = first; i < first + num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + atomic64_inc(&device->last_io_age); + atomic64_inc(&device->stripe_ignored); + }
+#endif + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ @@ -6031,10 +6200,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; + case BTRFS_READ_POLICY_QUEUE: + preferred_mirror = btrfs_read_earliest(fs_info, map, first, + num_stripes); + break; case BTRFS_READ_POLICY_DEVID: preferred_mirror = btrfs_read_preferred(map, first, num_stripes); break; @@ -6056,14 +6229,29 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, for (tolerance = 0; tolerance < 2; tolerance++) { if (map->stripes[preferred_mirror].dev->bdev && (tolerance || map->stripes[preferred_mirror].dev != srcdev)) - return preferred_mirror; + goto out; for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && - (tolerance || map->stripes[i].dev != srcdev)) - return i; + (tolerance || map->stripes[i].dev != srcdev)) { + preferred_mirror = i; + goto out; + } } } +out: +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + do { + struct btrfs_device *preferred_device = map->stripes[preferred_mirror].dev; + + /* reset age of selected stripe */ + atomic64_set(&preferred_device->last_io_age, 0); + + /* do not count ignores for the selected stripe */ + atomic64_dec(&preferred_device->stripe_ignored); + } while (0); +#endif + /* we couldn't find one that doesn't fail. 
Just return something * and the io error handling code will clean up eventually */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2cbf8080eade06..1d02a79e7b91a6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -201,6 +201,14 @@ struct btrfs_device { /* Bandwidth limit for scrub, in bytes */ u64 scrub_speed_max; + +#ifdef CONFIG_BTRFS_PER_DEVICE_IO_STATS + /* store an age of last read access */ + atomic64_t last_io_age; + + /* store how often a stripe has been ignored as a read candidate */ + atomic64_t stripe_ignored; +#endif }; /* @@ -310,9 +318,11 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, + /* Read from the device with the least in-flight requests */ + BTRFS_READ_POLICY_QUEUE, /* Read from a specific device. */ BTRFS_READ_POLICY_DEVID, #endif @@ -455,7 +465,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_EXPERIMENTAL +#ifdef CONFIG_BTRFS_READ_POLICIES /* * Minimum contiguous reads before switching to next device, the unit * is one block/sectorsize. @@ -464,7 +474,9 @@ struct btrfs_fs_devices { /* Device to be used for reading in case of RAID1. */ u64 read_devid; +#endif +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. 
*/ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -599,6 +611,9 @@ struct btrfs_device_info { u64 dev_offset; u64 max_avail; u64 total_avail; +#ifdef CONFIG_BTRFS_ALLOCATOR_HINTS + int alloc_hint; +#endif }; struct btrfs_raid_attr { @@ -890,6 +905,8 @@ int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d84..2c8f407be8a442 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -578,6 +578,27 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +/* + * dev_item.type: btrfs chunk allocation hints. + * + * Defined unconditionally: this header is exported to user space, where + * kernel config options are not visible. + */ +#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3 +/* preferred data chunk, but metadata chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL) +/* preferred metadata chunk, but data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL) +/* only metadata chunk allowed */ +#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) +/* only data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) +/* avoid chunk allocation if possible */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) +/* deny chunk allocation */ +#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL) +/* 6..7 are unused values */ + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid;