From e6d9e3f0d72602978d20b21b00e209de06ae0e5d Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Tue, 11 Oct 2016 12:04:52 -0700 Subject: [PATCH 1/3] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Add sysfs entries to support hinting for bypass/writeback by the ioprio assigned to the bio. If the bio is unassigned, use current's io-context ioprio for cache writeback or bypass (configured per-process with `ionice`). Having idle IOs bypass the cache can increase performance elsewhere since you probably don't care about their performance. In addition, this prevents idle IOs from promoting into (polluting) your cache and evicting blocks that are more important elsewhere. If you really need the performance at the expense of SSD wearout, then configure ioprio_writeback and set your `ionice` appropriately. For example: echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback See the documentation commit for details. v2: Removed calls to get_task_io_context() Signed-off-by: Eric Wheeler Acked-by: Kent Overstreet Tested-by: Kai Krakow Cc: nix@esperi.org.uk Signed-off-by: Kai Krakow --- drivers/md/bcache/bcache.h | 3 ++ drivers/md/bcache/request.c | 10 +++++ drivers/md/bcache/sysfs.c | 70 +++++++++++++++++++++++++++++++++++ drivers/md/bcache/writeback.c | 10 +++++ drivers/md/bcache/writeback.h | 10 +++++ 5 files changed, 103 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 785b0d9008face..cd1f5e9f6b06b5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -405,6 +405,9 @@ struct cached_dev { */ #define BCH_WBRATE_UPDATE_MAX_SKIPS 15 unsigned int rate_update_retry; + + unsigned short ioprio_writeback; + unsigned short ioprio_bypass; }; enum alloc_reserve { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index af345dc6fde14f..b035c9f920803d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c 
@@ -367,6 +367,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned int sectors, congested; struct task_struct *task = current; struct io *i; + unsigned short ioprio; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || (bio_op(bio) == REQ_OP_DISCARD)) @@ -408,6 +409,15 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + /* If process ioprio is lower-or-equal to dc->ioprio_bypass, then + * hint for bypass. Note that a lower-priority IO class+value + * has a greater numeric value. */ + ioprio = bio_prio(bio); + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio >= dc->ioprio_bypass) { + goto skip; + } + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e8f696cb58c056..530806942a3af0 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -152,6 +152,9 @@ rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); +rw_attribute(ioprio_writeback); +rw_attribute(ioprio_bypass); + static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], @@ -283,6 +286,16 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_ioprio_bypass) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_bypass), + IOPRIO_PRIO_DATA(dc->ioprio_bypass)); + + if (attr == &sysfs_ioprio_writeback) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_writeback), + IOPRIO_PRIO_DATA(dc->ioprio_writeback)); + #undef var return 0; } @@ -295,6 +308,10 @@ STORE(__cached_dev) ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; + unsigned ioprio_class = 0; /* invalid initial ioprio values */ + unsigned ioprio_level = IOPRIO_BE_NR; + unsigned short *ioprio_hint = NULL; + char *ioprio_type = 
NULL; /* no user space access if system is rebooting */ if (bcache_is_reboot) @@ -451,6 +468,57 @@ STORE(__cached_dev) if (attr == &sysfs_stop) bcache_device_stop(&dc->disk); + /* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */ + if (attr == &sysfs_ioprio_writeback) { + ioprio_hint = &dc->ioprio_writeback; + ioprio_type = "writeback"; + } + + if (attr == &sysfs_ioprio_bypass) { + ioprio_hint = &dc->ioprio_bypass; + ioprio_type = "bypass"; + } + + if (ioprio_hint != NULL) + { + if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2 + || ioprio_class > IOPRIO_CLASS_IDLE + || ioprio_level >= IOPRIO_BE_NR) { + pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.", + ioprio_type, + ioprio_class, ioprio_level); + return size; + } + + /* Use the maximum(/minimum) value in the class shift space to make integer + comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE. + This is necessary because there are no ioprio levels for the idle class. */ + if (ioprio_class == IOPRIO_CLASS_IDLE) { + if (ioprio_hint == &dc->ioprio_writeback) + ioprio_level = IOPRIO_PRIO_MASK; + else + /* Same, but 0 for bypass (inverted vs. writeback) */ + ioprio_level = 0; + } + + *ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level); + + if (!ioprio_valid(*ioprio_hint)) + pr_info("disabled ioprio_%s hints.", ioprio_type); + else + pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)", + ioprio_type, + ( ioprio_hint == &dc->ioprio_writeback ? 
"at-or-above" : "at-or-below" ), + ioprio_class, ioprio_level); + + if (ioprio_valid(dc->ioprio_writeback) + && ioprio_valid(dc->ioprio_bypass) + && dc->ioprio_writeback >= dc->ioprio_bypass) + pr_warn( + "warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; " + "will always writeback!\n"); + } + return size; } @@ -541,6 +609,8 @@ static struct attribute *bch_cached_dev_attrs[] = { #endif &sysfs_backing_dev_name, &sysfs_backing_dev_uuid, + &sysfs_ioprio_bypass, + &sysfs_ioprio_writeback, NULL }; ATTRIBUTE_GROUPS(bch_cached_dev); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c1d28e365910b9..22721fcfd7d38d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1071,6 +1071,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) /* For dc->writeback_lock contention in update_writeback_rate() */ dc->rate_update_retry = 0; + + /* + * These defaults provide the best SSD life by enabling bypass + * for priorities at-or-below BE-7. This also provides better + * performance (cache hits) by preventing (near-)idle processes from + * polluting the cache working set. Only set ioprio_writeback if + * you really need it: it will wear out your SSD sooner. 
+ */ + dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0); + dc->ioprio_bypass = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1)); WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 31df716951f66b..bf74e2832f0dfa 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -103,6 +103,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { unsigned int in_use = dc->disk.c->gc_stats.in_use; + unsigned short ioprio; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -120,6 +121,15 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; + /* If process ioprio is higher-or-equal to dc->ioprio_writeback, then + * hint for writeback. Note that a higher-priority IO class+value + * has a lesser numeric value. 
*/ + ioprio = bio_prio(bio); + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio <= dc->ioprio_writeback) { + return true; + } + return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || in_use <= bch_cutoff_writeback); From 446bc2ce6af93246e765f5600515c6f3f4518439 Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Tue, 11 Oct 2016 12:08:13 -0700 Subject: [PATCH 2/3] bcache: documentation for sysfs entries describing bcache cache hinting v2: get_task_io_context() no longer exists in the kernel Signed-off-by: Eric Wheeler Signed-off-by: Kai Krakow --- Documentation/admin-guide/bcache.rst | 79 ++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 6fdb495ac46652..c2d3fd546f8282 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -147,6 +147,85 @@ the backing devices to passthrough mode. writeback mode). It currently doesn't do anything intelligent if it fails to read some of the dirty data, though. +SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY +--------------------------------------------------------- + +Processes can be assigned an IO priority using `ionice` and bcache will +either try to writeback or bypass the cache based on the IO priority +level assigned to the process and the configuration of the sysfs ioprio +hints. If configured properly for your workload, this can both increase +performance and reduce SSD wear (erase/write cycles). + +Having idle IOs bypass the cache can increase performance elsewhere +since you probably don't care about their performance. In addition, +this prevents idle IOs from promoting into (polluting) your cache and +evicting blocks that are more important elsewhere. + +Default sysfs values: + 2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7. + 0,0: ioprio_writeback hinting is disabled by default. 
+ +Cache hinting is configured by writing 'class,level' pairs to sysfs. +In this example, we write the following: + + echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass + echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback + +Thus, processes with the following IO class (ionice -c) and level (-n) +will then behave as shown in this table: + + (-c) IO Class (-n) Class level Action + ----------------------------------------------------- + (1) Realtime 0-7 Writeback + (2) Best-effort 0 Writeback + (2) Best-effort 1-6 Normal, as if hinting were disabled + (2) Best-effort 7 Bypass cache + (3) Idle n/a Bypass cache + +For processes at-or-below best-effort-7 (ionice -c2 -n7), the +ioprio_bypass behavior is as follows: + +* Reads will come from the backing device and will not promote into + (pollute) your cache. If the block being read was already in the cache, + then it will be read from the cache (and remain cached). + +* If you are using writeback mode, then low-priority bypass-hinted writes + will go directly to the backing device. If the write was dirty in + cache, it will cache-invalidate and write directly to the backing + device. If a high-priority task later writes the same block then it + will writeback so no performance is lost for write-after-write. + + For read-after-bypassed-write, the block will be read from the backing + device (not cached) so there may be a miss penalty when a low-priority + process write bypasses the cache followed by a high-priority read that + would otherwise have hit. In practice, this is not an issue; to date, + none have wanted low-priority writes and high-priority reads of the + same block. + +For processes in our example at-or-above best-effort-0 (ionice -c2 -n0), +the ioprio_writeback behavior is as follows: + +* The writeback hint has no effect unless your 'cache_mode' is writeback. + Assuming writeback mode, all writes at this priority will writeback. 
+ Of course this will increase SSD wear, so only use writeback hinting + if you need it. + +* Reads are unaffected by ioprio_writeback, except that read-after-write + will of course read from the cache. + +Linux assigns processes the best-effort class with a level of 4 if +no priority is assigned. Thus, without `ionice` your processes will +follow normal bcache should_writeback/should_bypass semantics as if the +ioprio_writeback/ioprio_bypass sysfs flags were disabled. + +Also note that in order to be hinted by ioprio_writeback/ioprio_bypass, +the process must have a valid ioprio setting as returned by bio_prio(). +Thus, a process without an IO context will be ignored by the +ioprio_writeback/ioprio_bypass hints even if your sysfs hints specify that +best-effort-4 should be flagged for bypass or writeback. If in doubt, +explicitly set the process IO priority with `ionice`. + +See `man ionice` for more detail about per-process IO priority in Linux. Howto/cookbook -------------- From 6319846c7c059e4a0437aa376f57748b93287923 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Sat, 3 Oct 2020 12:18:24 +0200 Subject: [PATCH 3/3] bcache: Only skip data request in io_prio bypass mode Even if ioprio_bypass hints bypassing the request, we still allow it for REQ_META|REQ_PRIO bio. Cc: Eric Wheeler Signed-off-by: Kai Krakow --- drivers/md/bcache/request.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index b035c9f920803d..bdf9c2d21b7a93 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -409,11 +409,12 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } - /* If process ioprio is lower-or-equal to dc->ioprio_bypass, then - * hint for bypass. Note that a lower-priority IO class+value - * has a greater numeric value. 
*/ + /* If the bypass hint is enabled, the bio's ioprio is lower-or-equal to + * dc->ioprio_bypass and the request is not REQ_META|REQ_PRIO, hint for + * bypass. Note: a lower-priority IO class+value is numerically greater. */ ioprio = bio_prio(bio); - if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) + && ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_bypass) && ioprio >= dc->ioprio_bypass) { goto skip; }