From a8b349f5900542adf034582fab56b752a32b9ff8 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 13 Dec 2025 21:43:20 +0100 Subject: [PATCH 1/3] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Add sysfs entries to support hinting for bypass/writeback by the ioprio assigned to the bio. If the bio is unassigned, use current's io-context ioprio for cache writeback or bypass (configured per-process with `ionice`). Having idle IOs bypass the cache can increase performance elsewhere since you probably don't care about their performance. In addition, this prevents idle IOs from promoting into (polluting) your cache and evicting blocks that are more important elsewhere. If you really need the performance at the expense of SSD wearout, then configure ioprio_writeback and set your `ionice` appropriately. For example: echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback See the documentation commit for details. v2: Removed calls to get_task_io_context() v3: bio_prio() macro has been deleted Co-authored-by: Eric Wheeler Signed-off-by: Eric Wheeler Acked-by: Kent Overstreet Tested-by: Kai Krakow Cc: nix@esperi.org.uk Signed-off-by: Kai Krakow --- drivers/md/bcache/bcache.h | 3 ++ drivers/md/bcache/request.c | 10 +++++ drivers/md/bcache/sysfs.c | 70 +++++++++++++++++++++++++++++++++++ drivers/md/bcache/writeback.c | 10 +++++ drivers/md/bcache/writeback.h | 10 +++++ 5 files changed, 103 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d33e40d26ea51..747a40374865ab 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -405,6 +405,9 @@ struct cached_dev { */ #define BCH_WBRATE_UPDATE_MAX_SKIPS 15 unsigned int rate_update_retry; + + unsigned short ioprio_writeback; + unsigned short ioprio_bypass; }; enum alloc_reserve { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index af345dc6fde14f..a962e24c597fb6 100644 --- 
a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -367,6 +367,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned int sectors, congested; struct task_struct *task = current; struct io *i; + unsigned short ioprio; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || (bio_op(bio) == REQ_OP_DISCARD)) @@ -408,6 +409,15 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + /* If process ioprio is lower-or-equal to dc->ioprio_bypass, then + * hint for bypass. Note that a lower-priority IO class+value + * has a greater numeric value. */ + ioprio = bio->bi_ioprio; + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio >= dc->ioprio_bypass) { + goto skip; + } + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 826b14cae4e58e..ecadde82dc32f5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -152,6 +152,9 @@ rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); +rw_attribute(ioprio_writeback); +rw_attribute(ioprio_bypass); + static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], @@ -283,6 +286,16 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_ioprio_bypass) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_bypass), + IOPRIO_PRIO_DATA(dc->ioprio_bypass)); + + if (attr == &sysfs_ioprio_writeback) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_writeback), + IOPRIO_PRIO_DATA(dc->ioprio_writeback)); + #undef var return 0; } @@ -295,6 +308,10 @@ STORE(__cached_dev) ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; + unsigned ioprio_class = 0; /* invalid initial ioprio values */ + unsigned ioprio_level = 
IOPRIO_BE_NR; + unsigned short *ioprio_hint = NULL; + char *ioprio_type = NULL; /* no user space access if system is rebooting */ if (bcache_is_reboot) @@ -451,6 +468,57 @@ STORE(__cached_dev) if (attr == &sysfs_stop) bcache_device_stop(&dc->disk); + /* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */ + if (attr == &sysfs_ioprio_writeback) { + ioprio_hint = &dc->ioprio_writeback; + ioprio_type = "writeback"; + } + + if (attr == &sysfs_ioprio_bypass) { + ioprio_hint = &dc->ioprio_bypass; + ioprio_type = "bypass"; + } + + if (ioprio_hint != NULL) + { + if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2 + || ioprio_class > IOPRIO_CLASS_IDLE + || ioprio_level >= IOPRIO_BE_NR) { + pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.", + ioprio_type, + ioprio_class, ioprio_level); + return size; + } + + /* Use the maximum(/minimum) value in the class shift space to make integer + comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE. + This is necessary because there are no ioprio levels for the idle class. */ + if (ioprio_class == IOPRIO_CLASS_IDLE) { + if (ioprio_hint == &dc->ioprio_writeback) + ioprio_level = IOPRIO_PRIO_MASK; + else + /* Same, but 0 for bypass (inverted vs. writeback) */ + ioprio_level = 0; + } + + *ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level); + + if (!ioprio_valid(*ioprio_hint)) + pr_info("disabled ioprio_%s hints.", ioprio_type); + else + pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)", + ioprio_type, + ( ioprio_hint == &dc->ioprio_writeback ? 
"at-or-above" : "at-or-below" ), + ioprio_class, ioprio_level); + + if (ioprio_valid(dc->ioprio_writeback) + && ioprio_valid(dc->ioprio_bypass) + && dc->ioprio_writeback >= dc->ioprio_bypass) + pr_warn( + "warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; " + "will always writeback!\n"); + } + return size; } @@ -541,6 +609,8 @@ static struct attribute *bch_cached_dev_attrs[] = { #endif &sysfs_backing_dev_name, &sysfs_backing_dev_uuid, + &sysfs_ioprio_bypass, + &sysfs_ioprio_writeback, NULL }; ATTRIBUTE_GROUPS(bch_cached_dev); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ba73dc1a3dff1..99f0c050bccd4b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1068,6 +1068,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) /* For dc->writeback_lock contention in update_writeback_rate() */ dc->rate_update_retry = 0; + + /* + * These defaults provide the best SSD life by enabling bypass + * for priorities at-or-below BE-7. This also provides better + * performance (cache hits) by preventing (near-)idle processes from + * polluting the cache working set. Only set ioprio_writeback if + * you really need it: it will wear out your SSD sooner. 
+ */ + dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0); + dc->ioprio_bypass = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1)); WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 31df716951f66b..7710eeae19f361 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -103,6 +103,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { unsigned int in_use = dc->disk.c->gc_stats.in_use; + unsigned short ioprio; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -120,6 +121,15 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; + /* If process ioprio is higher-or-equal to dc->ioprio_writeback, then + * hint for writeback. Note that a higher-priority IO class+value + * has a lesser numeric value. 
*/ + ioprio = bio->bi_ioprio; + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio <= dc->ioprio_writeback) { + return true; + } + return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || in_use <= bch_cutoff_writeback); From 9dd53d2ef99f256b67296615bde594b66652202b Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Tue, 11 Oct 2016 12:08:13 -0700 Subject: [PATCH 2/3] bcache: documentation for sysfs entries describing bcache cache hinting v2: get_task_io_context() no longer exists in the kernel Signed-off-by: Eric Wheeler Signed-off-by: Kai Krakow --- Documentation/admin-guide/bcache.rst | 79 ++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 6fdb495ac46652..c2d3fd546f8282 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -147,6 +147,85 @@ the backing devices to passthrough mode. writeback mode). It currently doesn't do anything intelligent if it fails to read some of the dirty data, though. +SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY +--------------------------------------------------------- + +Processes can be assigned an IO priority using `ionice` and bcache will +either try to writeback or bypass the cache based on the IO priority +level assigned to the process and the configuration of the sysfs ioprio +hints. If configured properly for your workload, this can both increase +performance and reduce SSD wear (erase/write cycles). + +Having idle IOs bypass the cache can increase performance elsewhere +since you probably don't care about their performance. In addition, +this prevents idle IOs from promoting into (polluting) your cache and +evicting blocks that are more important elsewhere. + +Default sysfs values: + 2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7. + 0,0: ioprio_writeback hinting is disabled by default. 
+ +Cache hinting is configured by writing 'class,level' pairs to sysfs. +In this example, we write the following: + + echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass + echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback + +Thus, processes with the following IO class (ionice -c) and level (-n) +will then behave as shown in this table: + + (-c) IO Class (-n) Class level Action + ----------------------------------------------------- + (1) Realtime 0-7 Writeback + (2) Best-effort 0 Writeback + (2) Best-effort 1-6 Normal, as if hinting were disabled + (2) Best-effort 7 Bypass cache + (3) Idle n/a Bypass cache + +For processes at-or-below best-effort-7 (ionice -c2 -n7), the +ioprio_bypass behavior is as follows: + +* Reads will come from the backing device and will not promote into + (pollute) your cache. If the block being read was already in the cache, + then it will be read from the cache (and remain cached). + +* If you are using writeback mode, then low-priority bypass-hinted writes + will go directly to the backing device. If the write was dirty in + cache, it will cache-invalidate and write directly to the backing + device. If a high-priority task later writes the same block then it + will writeback so no performance is lost for write-after-write. + + For read-after-bypassed-write, the block will be read from the backing + device (not cached) so there may be a miss penalty when a low-priority + process write bypasses the cache followed by a high-priority read that + would otherwise have hit. In practice, this is not an issue; to date, + none have wanted low-priority writes and high-priority reads of the + same block. + +For processes in our example at-or-above best-effort-0 (ionice -c2 -n0), +the ioprio_writeback behavior is as follows: + +* The writeback hint has no effect unless your 'cache_mode' is writeback. + Assuming writeback mode, all writes at this priority will writeback. 
+ Of course this will increase SSD wear, so only use writeback hinting + if you need it. + +* Reads are unaffected by ioprio_writeback, except that read-after-write + will of course read from the cache. + +Linux assigns processes the best-effort class with a level of 4 if +no priority is assigned. Thus, without `ionice` your processes will +follow normal bcache should_writeback/should_bypass semantics as if the +ioprio_writeback/ioprio_bypass sysfs flags were disabled. + +Also note that in order to be hinted by ioprio_writeback/ioprio_bypass, +the process must have a valid ioprio setting as returned by bio_prio(). +Thus, a process without an IO context will be ignored by the +ioprio_writeback/ioprio_bypass hints even if your sysfs hints specify that +best-effort-4 should be flagged for bypass or writeback. If in doubt, +explicitly set the process IO priority with `ionice`. + +See `man ionice` for more detail about per-process IO priority in Linux. Howto/cookbook -------------- From 7b897c179b1454a9ee744790fd904e26aed05d12 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 3 Oct 2020 12:18:24 +0200 Subject: [PATCH 3/3] bcache: Only skip data request in io_prio bypass mode Even if ioprio_bypass hints bypassing the request, we still allow it for REQ_META|REQ_PRIO bio. Cc: Eric Wheeler Signed-off-by: Kai Krakow --- drivers/md/bcache/request.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a962e24c597fb6..1c5727ef3fb558 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -409,11 +409,12 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } - /* If process ioprio is lower-or-equal to dc->ioprio_bypass, then - * hint for bypass. Note that a lower-priority IO class+value - * has a greater numeric value. 
*/ + /* If dc->ioprio_bypass is valid, the bio ioprio is lower-or-equal to it, + * and the request is not REQ_META|REQ_PRIO, then hint for bypass. Note that + * a lower-priority IO class+value has a greater numeric value. */ ioprio = bio->bi_ioprio; - if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) + && ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_bypass) && ioprio >= dc->ioprio_bypass) { goto skip; }