diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 6fdb495ac46652..c2d3fd546f8282 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -147,6 +147,85 @@ the backing devices to passthrough mode. writeback mode). It currently doesn't do anything intelligent if it fails to read some of the dirty data, though. +SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY +--------------------------------------------------------- + +Processes can be assigned an IO priority using `ionice` and bcache will +either try to writeback or bypass the cache based on the IO priority +level assigned to the process and the configuration of the sysfs ioprio +hints. If configured properly for your workload, this can both increase +performance and reduce SSD wear (erase/write cycles). + +Having idle IOs bypass the cache can increase performance elsewhere +since you probably don't care about their performance. In addition, +this prevents idle IOs from promoting into (polluting) your cache and +evicting blocks that are more important elsewhere. + +Default sysfs values: + 2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7. + 0,0: ioprio_writeback hinting is disabled by default. + +Cache hinting is configured by writing 'class,level' pairs to sysfs. 
+In this example, we write the following: + + echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass + echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback + +Thus, processes with the following IO class (ionice -c) and level (-n) +will then behave as shown in this table: + + (-c) IO Class (-n) Class level Action + ----------------------------------------------------- + (1) Realtime 0-7 Writeback + (2) Best-effort 0 Writeback + (2) Best-effort 1-6 Normal, as if hinting were disabled + (2) Best-effort 7 Bypass cache + (3) Idle n/a Bypass cache + +For processes at-or-below best-effort-7 (ionice -c2 -n7), the +ioprio_bypass behavior is as follows: + +* Reads will come from the backing device and will not promote into + (pollute) your cache. If the block being read was already in the cache, + then it will be read from the cache (and remain cached). + +* If you are using writeback mode, then low-priority bypass-hinted writes + will go directly to the backing device. If the write was dirty in + cache, it will cache-invalidate and write directly to the backing + device. If a high-priority task later writes the same block then it + will writeback so no performance is lost for write-after-write. + + For read-after-bypassed-write, the block will be read from the backing + device (not cached) so there may be a miss penalty when a low-priority + process write bypasses the cache followed by a high-priority read that + would otherwise have hit. In practice, this is not an issue; to date, + none have wanted low-priority writes and high-priority reads of the + same block. + +For processes in our example at-or-above best-effort-0 (ionice -c2 -n0), +the ioprio_writeback behavior is as follows: + +* The writeback hint has no effect unless your 'cache_mode' is writeback. + Assuming writeback mode, all writes at this priority will writeback. + Of course this will increase SSD wear, so only use writeback hinting + if you need it. 
+ +* Reads are unaffected by ioprio_writeback, except that read-after-write + will of course read from the cache. + +Linux assigns processes the best-effort class with a level of 4 if +no priority is explicitly assigned. Thus, without `ionice` your processes will +follow normal bcache should_writeback/should_bypass semantics as if the +ioprio_writeback/ioprio_bypass sysfs flags were disabled. + +Also note that in order to be hinted by ioprio_writeback/ioprio_bypass, +the process must have a valid ioprio setting as returned by bio_prio(). +Thus, a process without an IO context will be ignored by the +ioprio_writeback/ioprio_bypass hints even if your sysfs hints specify that +best-effort-4 should be flagged for bypass or writeback. If in doubt, +explicitly set the process IO priority with `ionice`. + +See `man ionice` for more detail about per-process IO priority in Linux. Howto/cookbook -------------- diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d33e40d26ea51..747a40374865ab 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -405,6 +405,9 @@ struct cached_dev { */ #define BCH_WBRATE_UPDATE_MAX_SKIPS 15 unsigned int rate_update_retry; + + unsigned short ioprio_writeback; + unsigned short ioprio_bypass; }; enum alloc_reserve { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index af345dc6fde14f..1c5727ef3fb558 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -367,6 +367,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned int sectors, congested; struct task_struct *task = current; struct io *i; + unsigned short ioprio; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || (bio_op(bio) == REQ_OP_DISCARD)) @@ -408,6 +409,16 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + /* If process ioprio is lower-or-equal to dc->ioprio_bypass, and the + * request is not REQ_META|REQ_PRIO, then hint for bypass. 
Note that a + * lower-priority IO class+value has a greater numeric value. */ + ioprio = bio->bi_ioprio; + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) + && ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_bypass) + && ioprio >= dc->ioprio_bypass) { + goto skip; + } + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 826b14cae4e58e..ecadde82dc32f5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -152,6 +152,9 @@ rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); +rw_attribute(ioprio_writeback); +rw_attribute(ioprio_bypass); + static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], @@ -283,6 +286,16 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_ioprio_bypass) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_bypass), + IOPRIO_PRIO_DATA(dc->ioprio_bypass)); + + if (attr == &sysfs_ioprio_writeback) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_writeback), + IOPRIO_PRIO_DATA(dc->ioprio_writeback)); + #undef var return 0; } @@ -295,6 +308,10 @@ STORE(__cached_dev) ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; + unsigned ioprio_class = 0; /* invalid initial ioprio values */ + unsigned ioprio_level = IOPRIO_BE_NR; + unsigned short *ioprio_hint = NULL; + char *ioprio_type = NULL; /* no user space access if system is rebooting */ if (bcache_is_reboot) @@ -451,6 +468,57 @@ STORE(__cached_dev) if (attr == &sysfs_stop) bcache_device_stop(&dc->disk); + /* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */ + if (attr == &sysfs_ioprio_writeback) { + ioprio_hint = &dc->ioprio_writeback; + ioprio_type = "writeback"; + } + + if (attr == &sysfs_ioprio_bypass) { + ioprio_hint = 
&dc->ioprio_bypass; + ioprio_type = "bypass"; + } + + if (ioprio_hint != NULL) + { + if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2 + || ioprio_class > IOPRIO_CLASS_IDLE + || ioprio_level >= IOPRIO_BE_NR) { + pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.", + ioprio_type, + ioprio_class, ioprio_level); + return size; + } + + /* Use the maximum(/minimum) value in the class shift space to make integer + comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE. + This is necessary because there are no ioprio levels for the idle class. */ + if (ioprio_class == IOPRIO_CLASS_IDLE) { + if (ioprio_hint == &dc->ioprio_writeback) + ioprio_level = IOPRIO_PRIO_MASK; + else + /* Same, but 0 for bypass (inverted vs. writeback) */ + ioprio_level = 0; + } + + *ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level); + + if (!ioprio_valid(*ioprio_hint)) + pr_info("disabled ioprio_%s hints.", ioprio_type); + else + pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)", + ioprio_type, + ( ioprio_hint == &dc->ioprio_writeback ? 
"at-or-above" : "at-or-below" ), + ioprio_class, ioprio_level); + + if (ioprio_valid(dc->ioprio_writeback) + && ioprio_valid(dc->ioprio_bypass) + && dc->ioprio_writeback >= dc->ioprio_bypass) + pr_warn( + "warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; " + "will always writeback!\n"); + } + return size; } @@ -541,6 +609,8 @@ static struct attribute *bch_cached_dev_attrs[] = { #endif &sysfs_backing_dev_name, &sysfs_backing_dev_uuid, + &sysfs_ioprio_bypass, + &sysfs_ioprio_writeback, NULL }; ATTRIBUTE_GROUPS(bch_cached_dev); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ba73dc1a3dff1..99f0c050bccd4b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1068,6 +1068,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) /* For dc->writeback_lock contention in update_writeback_rate() */ dc->rate_update_retry = 0; + + /* + * These defaults provide the best SSD life by enabling bypass + * for priorities at-or-below BE-7. This also provides better + * performance (cache hits) by preventing (near-)idle processes from + * polluting the cache working set. Only set ioprio_writeback if + * you really need it: it will wear out your SSD sooner. 
+ */ + dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0); + dc->ioprio_bypass = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1)); WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 31df716951f66b..7710eeae19f361 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -103,6 +103,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { unsigned int in_use = dc->disk.c->gc_stats.in_use; + unsigned short ioprio; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -120,6 +121,15 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; + /* If process ioprio is higher-or-equal to dc->ioprio_writeback, then + * hint for writeback. Note that a higher-priority IO class+value + * has a lesser numeric value. */ + ioprio = bio->bi_ioprio; + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio <= dc->ioprio_writeback) { + return true; + } + return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || in_use <= bch_cutoff_writeback);