From 776ad0d42950e15be5ce1ff8e9872bdd8daf1e83 Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Tue, 7 Dec 2021 15:05:09 -0800
Subject: [PATCH] zvol: Support blk-mq for better performance

Add support for the kernel's block multiqueue (blk-mq) interface in
the zvol block driver.  blk-mq creates multiple request queues on
different CPUs rather than having a single request queue.  This can
improve zvol performance with multithreaded reads/writes.

This implementation uses the blk-mq interfaces on 4.13 or newer
kernels.  Building against older kernels will fall back to the
older BIO interfaces.

Note that you must set the `zvol_use_blk_mq` module param to
enable the blk-mq API.  It is disabled by default.

In addition, this commit lets the zvol blk-mq layer process whole
`struct request` IOs at a time, rather than breaking them down
into their individual BIOs.  This reduces dbuf lock contention
and overhead versus the legacy zvol submit_bio() codepath.

  sequential dd to one zvol, 8k volblocksize, no O_DIRECT:

  legacy submit_bio()    292MB/s write  453MB/s read
  this commit            453MB/s write  885MB/s read

It also introduces a new `zvol_blk_mq_blocks_per_thread` module
parameter.  This parameter represents how many volblocksize-sized
blocks to process per zvol thread.  It can be used to tune your
zvols for better read vs. write performance (higher values favor
writes, lower values favor reads).

Signed-off-by: Tony Hutter
Issue #12483
---
 config/kernel-blk-queue.m4                    |  32 +
 configure.ac                                  |   1 +
 include/os/linux/kernel/linux/blkdev_compat.h |  87 +++
 include/os/linux/spl/sys/uio.h                |  37 +-
 man/man4/zfs.4                                |  64 +-
 module/os/linux/zfs/zfs_uio.c                 | 153 ++++-
 module/os/linux/zfs/zvol_os.c                 | 631 ++++++++++++++----
 tests/runfiles/common.run                     |   4 +
 tests/zfs-tests/include/libtest.shlib         |  37 +-
 tests/zfs-tests/include/tunables.cfg          |   1 +
 .../tests/functional/zvol/Makefile.am         |   1 +
 .../functional/zvol/zvol_stress/Makefile.am   |   5 +
 .../functional/zvol/zvol_stress/cleanup.ksh   |  36 +
 .../functional/zvol/zvol_stress/setup.ksh     |  38 ++
 .../zvol/zvol_stress/zvol_stress.ksh          | 171 +++++
 15 files changed, 1146 insertions(+), 152 deletions(-)
 create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4
index 559ae9800e8f..907a2c48ce67 100644
--- a/config/kernel-blk-queue.m4
+++ b/config/kernel-blk-queue.m4
@@ -315,6 +315,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 	])
 ])
 
+dnl #
+dnl # See if kernel supports block multi-queue and blk_status_t.
+dnl # blk_status_t represents the new status codes introduced in the 4.13
+dnl # kernel patch:
+dnl #
+dnl #	block: introduce new block status code type
+dnl #
+dnl # We do not currently support the "old" block multi-queue interfaces from
+dnl # prior kernels.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
+	ZFS_LINUX_TEST_SRC([blk_mq], [
+		#include <linux/blk-mq.h>
+	], [
+		struct blk_mq_tag_set tag_set = {0};
+		(void) blk_mq_alloc_tag_set(&tag_set);
+		return BLK_STS_OK;
+	], [$NO_UNUSED_BUT_SET_VARIABLE])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
+	AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
+	ZFS_LINUX_TEST_RESULT([blk_mq], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
@@ -326,6 +356,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
+	ZFS_AC_KERNEL_SRC_BLK_MQ
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
@@ -339,4 +370,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
+	ZFS_AC_KERNEL_BLK_MQ
 ])
diff --git a/configure.ac b/configure.ac
index 7037c06b225f..dc711fe7b6fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -403,6 +403,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/zpool_influxdb/Makefile
 	tests/zfs-tests/tests/functional/zvol/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile
+	tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index 9fa8884bb7a1..6d9c6c0f1a48 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include /* for SECTOR_* */
+#include <linux/blk-mq.h>
 
 #ifndef HAVE_BLK_QUEUE_FLAG_SET
 static inline void
@@ -579,4 +580,90 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
 }
 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
+static inline int
+io_data_dir(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL) {
+		enum req_opf op = req_op(rq);
+		if (op_is_write(op)) {
+			return (WRITE);
+		} else {
+			return (READ);
+		}
+	}
+#endif
+	return (bio_data_dir(bio));
+}
+
+static inline int
+io_is_flush(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (req_op(rq) == REQ_OP_FLUSH);
+#endif
+	return (bio_is_flush(bio));
+}
+
+static inline int
+io_is_discard(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (req_op(rq) == REQ_OP_DISCARD);
+#endif
+	return (bio_is_discard(bio));
+}
+
+static inline int
+io_is_secure_erase(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (req_op(rq) == REQ_OP_SECURE_ERASE);
+#endif
+	return (bio_is_secure_erase(bio));
+}
+
+static inline int
+io_is_fua(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (rq->cmd_flags & REQ_FUA);
+#endif
+	return (bio_is_fua(bio));
+}
+
+
+static inline uint64_t
+io_offset(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (blk_rq_pos(rq) << 9);
+#endif
+	return (BIO_BI_SECTOR(bio) << 9);
+}
+
+static inline uint64_t
+io_size(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (blk_rq_bytes(rq));
+#endif
+	return (BIO_BI_SIZE(bio));
+}
+
+static inline int
+io_has_data(struct bio *bio, struct request *rq)
+{
+#ifdef HAVE_BLK_MQ
+	if (rq != NULL)
+		return (bio_has_data(rq->bio));
+#endif
+	return (bio_has_data(bio));
+}
 #endif /* _ZFS_BLKDEV_H */
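[Editorial aside: a minimal userspace sketch of the dispatch pattern the io_*()
helpers above follow. Every helper takes both a bio and a request, and
whichever is non-NULL decides which block-layer API answers the question. The
struct names below are toy stand-ins, not the kernel types.]

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_bio { unsigned long long sector; unsigned int bytes; };
    struct toy_rq  { unsigned long long sector; unsigned int bytes; };

    /* Byte offset of the I/O, regardless of which interface delivered it. */
    static unsigned long long
    toy_io_offset(const struct toy_bio *bio, const struct toy_rq *rq)
    {
        if (rq != NULL)
            return (rq->sector << 9);    /* blk_rq_pos() analog */
        return (bio->sector << 9);       /* BIO_BI_SECTOR() analog */
    }

    int
    main(void)
    {
        struct toy_bio bio = { .sector = 2048, .bytes = 4096 };
        struct toy_rq rq = { .sector = 4096, .bytes = 8192 };

        /* Legacy path: bio set, rq NULL. blk-mq path: the reverse. */
        assert(toy_io_offset(&bio, NULL) == 1048576ULL);
        assert(toy_io_offset(NULL, &rq) == 2097152ULL);
        printf("both paths resolve through one helper\n");
        return (0);
    }

[This is why zvol_write()/zvol_read()/zvol_discard() below can stay a single
codepath: only the accessor helpers care which interface the I/O came from.]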
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h
index 439eec986236..ed13a4398b25 100644
--- a/include/os/linux/spl/sys/uio.h
+++ b/include/os/linux/spl/sys/uio.h
@@ -68,10 +68,20 @@ typedef struct zfs_uio {
 	boolean_t	uio_fault_disable;
 	uint16_t	uio_fmode;
 	uint16_t	uio_extflg;
-	ssize_t	uio_resid;
+	ssize_t		uio_resid;
 
 	size_t		uio_skip;
+
+	struct request	*rq;
+
+	/*
+	 * Used for saving rq_for_each_segment() state between calls
+	 * to zfs_uiomove_bvec_rq().
+	 */
+	struct req_iterator iter;
+	struct bio_vec	bv;
 } zfs_uio_t;
+
 
 #define	zfs_uio_segflg(u)	(u)->uio_segflg
 #define	zfs_uio_offset(u)	(u)->uio_loffset
 #define	zfs_uio_resid(u)	(u)->uio_resid
@@ -116,17 +126,30 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
 }
 
 static inline void
-zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio)
+zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
 {
-	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
-	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
-	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
+	if (bio) {
+		uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
+		uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
+	} else {
+		uio->uio_bvec = NULL;
+		uio->uio_iovcnt = 0;
+		memset(&uio->iter, 0, sizeof (uio->iter));
+	}
+
+	uio->uio_loffset = io_offset(bio, rq);
 	uio->uio_segflg = UIO_BVEC;
 	uio->uio_fault_disable = B_FALSE;
 	uio->uio_fmode = 0;
 	uio->uio_extflg = 0;
-	uio->uio_resid = BIO_BI_SIZE(bio);
-	uio->uio_skip = BIO_BI_SKIP(bio);
+	uio->uio_resid = io_size(bio, rq);
+	if (bio) {
+		uio->uio_skip = BIO_BI_SKIP(bio);
+	} else {
+		uio->uio_skip = 0;
+	}
+
+	uio->rq = rq;
 }
 
 #if defined(HAVE_VFS_IOV_ITER)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index bb3cd2243ad3..eed90fbd2f51 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2207,9 +2207,67 @@ for each I/O submitter.
 When unset, requests are handled asynchronously by a thread pool.
 The number of requests which can be handled concurrently is controlled by
 .Sy zvol_threads .
-.
-.It Sy zvol_threads Ns = Ns Sy 32 Pq uint
-Max number of threads which can handle zvol I/O requests concurrently.
+.Sy zvol_request_sync
+is ignored when running on a kernel that supports block multiqueue
+.Pq Li blk-mq .
+.
+.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
+The number of threads to use for processing zvol block IOs.
+On older
+.No non- Ns Li blk-mq
+kernels,
+.Sy zvol_threads
+is the total number of threads to use for all zvols.
+On kernels that support
+.Li blk-mq ,
+.Sy zvol_threads
+is the total number of threads per zvol.
+If
+.Sy 0
+(the default),
+.Sy zvol_threads
+is set internally to the number of online CPUs.
+.
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Set to
+.Sy 1
+to use the
+.Li blk-mq
+API for zvols.
+Set to
+.Sy 0
+(the default) to use the legacy zvol APIs.
+This setting can give better or worse zvol performance depending on
+the workload.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+.
+.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
+If
+.Sy zvol_use_blk_mq
+is enabled, then process this number of volblocksize blocks per zvol thread.
+This tunable can be used to favor better performance for zvol reads (lower
+values) or writes (higher values).
+If set to 0, then the zvol layer will process the maximum number of blocks
+per thread that it can.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+.
+.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
+The queue_depth value for the zvol
+.Li blk-mq
+interface.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read at zvol load time.
+If
+.Sy 0
+(the default) then use the kernel's default queue depth.
+If you set
+.Sy zvol_blk_mq_queue_depth
+lower than the kernel's minimum queue depth, it will be raised internally
+to the kernel's minimum queue depth (currently 4 on 5.15 kernels).
 .
 .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
 Defines zvol block devices behaviour when
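[Editorial aside: a worked example of the zvol_blk_mq_blocks_per_thread
arithmetic described above, as a standalone sketch. With an 8 KiB volblocksize
and the default of 8 blocks per thread, each request handed to a zvol thread
is capped at 64 KiB, which the driver expresses to the block layer as a number
of PAGE_SIZE segments (see zvol_os_create_minor() later in this patch). The
4 KiB page size below is an assumption for the example.]

    #include <stdio.h>

    #define TOY_PAGE_SIZE 4096u    /* assumption: 4 KiB pages */

    int
    main(void)
    {
        unsigned int volblocksize = 8192;      /* zvol's volblocksize */
        unsigned int blocks_per_thread = 8;    /* module param default */

        unsigned int chunk_bytes = volblocksize * blocks_per_thread;
        unsigned int max_segments = chunk_bytes / TOY_PAGE_SIZE;

        printf("per-thread request cap: %u bytes (%u segments of %u)\n",
            chunk_bytes, max_segments, TOY_PAGE_SIZE);
        /* prints: per-thread request cap: 65536 bytes (16 segments of 4096) */
        return (0);
    }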
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index a3d5d5f83b6f..ece9cfe8dfdb 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include <linux/blkdev_compat.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -126,7 +127,7 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 }
 
 static int
-zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 {
 	const struct bio_vec *bv = uio->uio_bvec;
 	size_t skip = uio->uio_skip;
@@ -137,10 +138,13 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 		cnt = MIN(bv->bv_len - skip, n);
 
 		paddr = zfs_kmap_atomic(bv->bv_page);
-		if (rw == UIO_READ)
+		if (rw == UIO_READ) {
+			/* Copy from buffer 'p' to the bvec data */
 			bcopy(p, paddr + bv->bv_offset + skip, cnt);
-		else
+		} else {
+			/* Copy from bvec data to buffer 'p' */
 			bcopy(paddr + bv->bv_offset + skip, p, cnt);
+		}
 		zfs_kunmap_atomic(paddr);
 
 		skip += cnt;
@@ -158,6 +162,139 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
 	return (0);
 }
 
+#ifdef HAVE_BLK_MQ
+static void
+zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
+    struct bio_vec *bv)
+{
+	void *paddr;
+
+	paddr = zfs_kmap_atomic(bv->bv_page);
+	if (rw == UIO_READ) {
+		/* Copy from buffer 'p' to the bvec data */
+		bcopy(p, paddr + bv->bv_offset + skip, cnt);
+	} else {
+		/* Copy from bvec data to buffer 'p' */
+		bcopy(paddr + bv->bv_offset + skip, p, cnt);
+	}
+	zfs_kunmap_atomic(paddr);
+}
+
+/*
+ * Copy 'n' bytes of data between the buffer p[] and the data represented
+ * by the request in the uio.
+ */
+static int
+zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+	struct request *rq = uio->rq;
+	struct bio_vec bv;
+	struct req_iterator iter;
+	size_t this_seg_start;	/* logical offset */
+	size_t this_seg_end;	/* logical offset */
+	size_t skip_in_seg;
+	size_t copy_from_seg;
+	size_t orig_loffset;
+	int copied = 0;
+
+	/*
+	 * Get the original logical offset of this entire request (because
+	 * uio->uio_loffset will be modified over time).
+	 */
+	orig_loffset = io_offset(NULL, rq);
+	this_seg_start = orig_loffset;
+
+	rq_for_each_segment(bv, rq, iter) {
+		if (uio->iter.bio) {
+			/*
+			 * If uio->iter.bio is present, then we know we've saved
+			 * uio->iter from a previous call to this function, and
+			 * we can skip ahead in this rq_for_each_segment() loop
+			 * to where we last left off. That way, we don't need
+			 * to iterate over tons of segments we've already
+			 * processed - we can just restore the "saved state".
+			 */
+			iter = uio->iter;
+			bv = uio->bv;
+			this_seg_start = uio->uio_loffset;
+			memset(&uio->iter, 0, sizeof (uio->iter));
+			continue;
+		}
+
+		/*
+		 * Lookup what the logical offset of the last byte of this
+		 * segment is.
+		 */
+		this_seg_end = this_seg_start + bv.bv_len - 1;
+
+		/*
+		 * We only need to operate on segments that have data we're
+		 * copying.
+		 */
+		if (uio->uio_loffset >= this_seg_start &&
+		    uio->uio_loffset <= this_seg_end) {
+			/*
+			 * Some, or all, of the data in this segment needs to be
+			 * copied.
+			 */
+
+			/*
+			 * We may not be copying from the first byte in the
+			 * segment. Figure out how many bytes to skip copying
+			 * from the beginning of this segment.
+			 */
+			skip_in_seg = uio->uio_loffset - this_seg_start;
+
+			/*
+			 * Calculate the total number of bytes from this
+			 * segment that we will be copying.
+			 */
+			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);
+
+			/* Copy the bytes */
+			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
+			p = ((char *)p) + copy_from_seg;
+
+			n -= copy_from_seg;
+			uio->uio_resid -= copy_from_seg;
+			uio->uio_loffset += copy_from_seg;
+			copied = 1;	/* We copied some data */
+		}
+
+		if (n == 0) {
+			/*
+			 * All done copying. Save our 'iter' value to the uio.
+			 * This allows us to "save our state" and skip ahead in
+			 * the rq_for_each_segment() loop the next time we
+			 * call zfs_uiomove_bvec_rq() on this uio (which we
+			 * will be doing for any remaining data in the uio).
+			 */
+			uio->iter = iter;	/* make a copy of the struct data */
+			uio->bv = bv;
+			return (0);
+		}
+
+		this_seg_start = this_seg_end + 1;
+	}
+
+	if (!copied) {
+		/* Didn't copy anything */
+		uio->uio_resid = 0;
+	}
+	return (0);
+}
+#endif
+
+static inline int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+#ifdef HAVE_BLK_MQ
+	if (uio->rq != NULL)
+		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
+#endif
+	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
+}
+
 #if defined(HAVE_VFS_IOV_ITER)
 static int
 zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
@@ -300,8 +437,14 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
 {
 	if (n > uio->uio_resid)
 		return;
-
-	if (uio->uio_segflg == UIO_BVEC) {
+	/*
+	 * When using a uio with a struct request, we simply
+	 * use uio_loffset as a pointer to the next logical byte to
+	 * copy in the request. We don't have to do any fancy
+	 * accounting with uio_bvec/uio_iovcnt since we don't use
+	 * them.
+	 */
+	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
 		uio->uio_skip += n;
 		while (uio->uio_iovcnt &&
 		    uio->uio_skip >= uio->uio_bvec->bv_len) {
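[Editorial aside: zfs_uiomove_bvec_rq() above may be called many times for one
request, so it stashes the segment iterator in the uio and resumes from it on
the next call rather than rescanning from segment zero. A standalone sketch of
the same save/resume idea over a plain array of segment lengths; the names are
illustrative, not the ZFS ones.]

    #include <stdio.h>

    struct cursor { int seg; unsigned int off; };   /* saved "iter" state */

    /* Consume n bytes starting from the saved cursor, then re-save it. */
    static void
    consume(const unsigned int *seglen, int nsegs, struct cursor *c,
        unsigned int n)
    {
        while (n > 0 && c->seg < nsegs) {
            unsigned int left = seglen[c->seg] - c->off;
            unsigned int cnt = (n < left) ? n : left;

            printf("seg %d: %u bytes at offset %u\n", c->seg, cnt, c->off);
            c->off += cnt;
            n -= cnt;
            if (c->off == seglen[c->seg]) {    /* segment finished */
                c->seg++;
                c->off = 0;
            }
        }
    }

    int
    main(void)
    {
        unsigned int seglen[] = { 4096, 4096, 4096 };
        struct cursor c = { 0, 0 };

        consume(seglen, 3, &c, 6144);  /* segs 0 and half of 1 */
        consume(seglen, 3, &c, 6144);  /* resumes mid-segment 1, no rescan */
        return (0);
    }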
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index f772f416043e..a9606caef48b 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -41,17 +41,80 @@
 #include
 #include
 
-static unsigned int zvol_major = ZVOL_MAJOR;
-static unsigned int zvol_request_sync = 0;
-static unsigned int zvol_prefetch_bytes = (128 * 1024);
-static unsigned long zvol_max_discard_blocks = 16384;
-static unsigned int zvol_threads = 32;
-static const unsigned int zvol_open_timeout_ms = 1000;
+#ifdef HAVE_BLK_MQ
+#include <linux/blk-mq.h>
+#endif
+
+static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
+    struct request *rq, boolean_t force_sync);
+
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_request_sync = 0;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+unsigned int zvol_open_timeout_ms = 1000;
+
+/*
+ * zvol_threads is the module param the user passes in.
+ *
+ * zvol_actual_threads is what we use internally, since the user can pass
+ * zvol_threads = 0 to mean "use all the CPUs" (the default). So on a quad
+ * core system, you would have: zvol_threads = 0, zvol_actual_threads = 4.
+ */
+static unsigned int zvol_threads = 0;
+static unsigned int zvol_actual_threads;
+
+#ifdef HAVE_BLK_MQ
+static boolean_t zvol_use_blk_mq = B_FALSE;
+
+/*
+ * The maximum number of volblocksize blocks to process per thread. Typically,
+ * write heavy workloads perform better with higher values here, and read
+ * heavy workloads perform better with lower values, but that's not a hard
+ * and fast rule. It's basically a knob to tune between "less overhead with
+ * less parallelism" and "more overhead, but more parallelism".
+ *
+ * '8' was chosen as a reasonable, balanced default based on sequential
+ * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
+ */
+static unsigned int zvol_blk_mq_blocks_per_thread = 8;
+#endif
+
+#ifndef BLKDEV_DEFAULT_RQ
+/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
+#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
+#endif
+
+/*
+ * Finalize our BIO or request.
+ */
+#ifdef HAVE_BLK_MQ
+#define	END_IO(zv, bio, rq, error)  do { \
+	if (bio) { \
+		BIO_END_IO(bio, error); \
+	} else { \
+		blk_mq_end_request(rq, errno_to_bi_status(error)); \
+	} \
+} while (0)
+#else
+#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
+#endif
+
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+static unsigned int zvol_actual_blk_mq_queue_depth;
+#endif
 
 struct zvol_state_os {
 	struct gendisk		*zvo_disk;	/* generic disk */
 	struct request_queue	*zvo_queue;	/* request queue */
 	dev_t			zvo_dev;	/* device id */
+
+#ifdef HAVE_BLK_MQ
+	struct blk_mq_tag_set tag_set;
+#endif
+
+	/* Set from the global 'zvol_use_blk_mq' at zvol load */
+	boolean_t use_blk_mq;
 };
 
 taskq_t *zvol_taskq;
@@ -60,8 +123,14 @@ static struct ida zvol_ida;
 typedef struct zv_request_stack {
 	zvol_state_t	*zv;
 	struct bio	*bio;
+	struct request	*rq;
 } zv_request_t;
 
+typedef struct zv_work {
+	struct request	*rq;
+	struct work_struct work;
+} zv_work_t;
+
 typedef struct zv_request_task {
 	zv_request_t zvr;
 	taskq_ent_t	ent;
@@ -83,6 +152,62 @@ zv_request_task_free(zv_request_task_t *task)
 	kmem_free(task, sizeof (*task));
 }
 
+#ifdef HAVE_BLK_MQ
+
+/*
+ * This is called when a new block multiqueue request comes in. A request
+ * contains one or more BIOs.
+ */
+static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+    const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	zvol_state_t *zv = rq->q->queuedata;
+
+	/* Tell the kernel that we are starting to process this request */
+	blk_mq_start_request(rq);
+
+	if (blk_rq_is_passthrough(rq)) {
+		/* Skip non-filesystem requests */
+		blk_mq_end_request(rq, BLK_STS_IOERR);
+		return (BLK_STS_IOERR);
+	}
+
+	zvol_request_impl(zv, NULL, rq, 0);
+
+	/* Acknowledge to the kernel that we got this request */
+	return (BLK_STS_OK);
+}
+
+static struct blk_mq_ops zvol_blk_mq_queue_ops = {
+	.queue_rq = zvol_mq_queue_rq,
+};
+
+/* Initialize our blk-mq struct */
+static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
+{
+	struct zvol_state_os *zso = zv->zv_zso;
+
+	memset(&zso->tag_set, 0, sizeof (zso->tag_set));
+
+	/* Initialize tag set.
*/ + zso->tag_set.ops = &zvol_blk_mq_queue_ops; + zso->tag_set.nr_hw_queues = zvol_actual_threads; + zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; + zso->tag_set.numa_node = NUMA_NO_NODE; + zso->tag_set.cmd_size = 0; + + /* + * We need BLK_MQ_F_BLOCKING here since we do blocking calls in + * zvol_request_impl() + */ + zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + zso->tag_set.driver_data = zv; + + return (blk_mq_alloc_tag_set(&zso->tag_set)); +} +#endif /* HAVE_BLK_MQ */ + /* * Given a path, return TRUE if path is a ZVOL. */ @@ -104,38 +229,47 @@ static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; - - zfs_uio_bvec_init(&uio, bio); - zvol_state_t *zv = zvr->zv; + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time = 0; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ - if (bio_is_flush(bio)) + if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ - if (uio.uio_resid == 0) { + if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); return; } - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; + zfs_uio_bvec_init(&uio, bio, rq); + ssize_t start_resid = uio.uio_resid; - unsigned long start_time; - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + /* + * With use_blk_mq, accounting is done by blk_mq_start_request() + * and blk_mq_end_request(), so we can skip it here. 
+	 */
+	boolean_t acct = B_FALSE;
+	if (!zv->zv_zso->use_blk_mq) {
+		acct = blk_queue_io_stat(q);
+		if (acct) {
+			start_time = blk_generic_start_io_acct(q, disk, WRITE,
+			    bio);
+		}
+	}
 
 	boolean_t sync =
-	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
@@ -177,10 +311,11 @@ zvol_write(zv_request_t *zvr)
 
 	rw_exit(&zv->zv_suspend_lock);
 
-	if (acct)
+	if (acct) {
 		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+	}
 
-	BIO_END_IO(bio, -error);
+	END_IO(zv, bio, rq, -error);
 }
 
 static void
@@ -195,27 +330,32 @@ static void
 zvol_discard(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
+	struct request *rq = zvr->rq;
 	zvol_state_t *zv = zvr->zv;
-	uint64_t start = BIO_BI_SECTOR(bio) << 9;
-	uint64_t size = BIO_BI_SIZE(bio);
+	uint64_t start = io_offset(bio, rq);
+	uint64_t size = io_size(bio, rq);
 	uint64_t end = start + size;
 	boolean_t sync;
 	int error = 0;
 	dmu_tx_t *tx;
+	struct request_queue *q = zv->zv_zso->zvo_queue;
+	struct gendisk *disk = zv->zv_zso->zvo_disk;
+	unsigned long start_time = 0;
+
+	boolean_t acct = blk_queue_io_stat(q);
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	ASSERT3P(zv->zv_zilog, !=, NULL);
 
-	struct request_queue *q = zv->zv_zso->zvo_queue;
-	struct gendisk *disk = zv->zv_zso->zvo_disk;
-	unsigned long start_time;
-
-	boolean_t acct = blk_queue_io_stat(q);
-	if (acct)
-		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+	if (bio) {
+		if (acct) {
+			start_time = blk_generic_start_io_acct(q, disk, WRITE,
+			    bio);
+		}
+	}
 
-	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	if (end > zv->zv_volsize) {
 		error = SET_ERROR(EIO);
@@ -228,7 +368,7 @@ zvol_discard(zv_request_t *zvr)
 	 * the unaligned parts which is slow (read-modify-write) and useless
 	 * since we are not freeing any space by doing so.
 	 */
-	if (!bio_is_secure_erase(bio)) {
+	if (!io_is_secure_erase(bio, rq)) {
 		start = P2ROUNDUP(start, zv->zv_volblocksize);
 		end = P2ALIGN(end, zv->zv_volblocksize);
 		size = end - start;
@@ -259,10 +399,14 @@ zvol_discard(zv_request_t *zvr)
 unlock:
 	rw_exit(&zv->zv_suspend_lock);
 
-	if (acct)
-		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+	if (bio) {
+		if (acct) {
+			blk_generic_end_io_acct(q, disk, WRITE, bio,
+			    start_time);
+		}
+	}
 
-	BIO_END_IO(bio, -error);
+	END_IO(zv, bio, rq, -error);
 }
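[Editorial aside: a standalone illustration of the alignment step in
zvol_discard() above. A discard is shrunk to whole volblocksize blocks, since
partially freeing a block is a slow read-modify-write that frees nothing,
while a secure erase keeps its exact byte range. P2ROUNDUP/P2ALIGN are the
usual power-of-two helpers, restated here so the example compiles alone.]

    #include <stdio.h>

    #define P2ALIGN(x, a)   ((x) & -(a))
    #define P2ROUNDUP(x, a) ((((x) - 1) | ((a) - 1)) + 1)

    int
    main(void)
    {
        unsigned long long start = 5000, end = 20000;  /* byte range */
        unsigned long long bs = 8192;                  /* volblocksize */

        unsigned long long astart = P2ROUNDUP(start, bs);
        unsigned long long aend = P2ALIGN(end, bs);

        /* 5000..20000 shrinks to 8192..16384: one whole 8 KiB block */
        printf("trimmed discard: [%llu, %llu), %llu bytes\n",
            astart, aend, aend - astart);
        return (0);
    }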
 
 static void
@@ -277,28 +421,38 @@
 zvol_read(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
+	struct request *rq = zvr->rq;
 	int error = 0;
 	zfs_uio_t uio;
+	boolean_t acct = B_FALSE;
+	zvol_state_t *zv = zvr->zv;
+	struct request_queue *q = zv->zv_zso->zvo_queue;
+	struct gendisk *disk = zv->zv_zso->zvo_disk;
+	unsigned long start_time = 0;
 
-	zfs_uio_bvec_init(&uio, bio);
+	zfs_uio_bvec_init(&uio, bio, rq);
 
-	zvol_state_t *zv = zvr->zv;
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
-	struct request_queue *q = zv->zv_zso->zvo_queue;
-	struct gendisk *disk = zv->zv_zso->zvo_disk;
 	ssize_t start_resid = uio.uio_resid;
-	unsigned long start_time;
 
-	boolean_t acct = blk_queue_io_stat(q);
-	if (acct)
-		start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+	/*
+	 * When blk-mq is being used, accounting is done by
+	 * blk_mq_start_request() and blk_mq_end_request().
+	 */
+	if (!zv->zv_zso->use_blk_mq) {
+		acct = blk_queue_io_stat(q);
+		if (acct)
+			start_time = blk_generic_start_io_acct(q, disk, READ,
+			    bio);
+	}
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_READER);
 
 	uint64_t volsize = zv->zv_volsize;
+
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
@@ -322,10 +476,12 @@ zvol_read(zv_request_t *zvr)
 
 	rw_exit(&zv->zv_suspend_lock);
 
-	if (acct)
-		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+	if (!zv->zv_zso->use_blk_mq) {
+		if (acct)
+			blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+	}
 
-	BIO_END_IO(bio, -error);
+	END_IO(zv, bio, rq, -error);
 }
 
 static void
@@ -336,55 +492,45 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+
+/*
+ * Process a BIO
+ *
+ * force_sync:	Set to 0 to defer processing the BIO to a background taskq
+ *		Set to 1 to process the BIO right now.
+ */
 static void
-zvol_submit_bio(struct bio *bio)
-#else
-static blk_qc_t
-zvol_submit_bio(struct bio *bio)
-#endif
-#else
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-#endif
+zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
+    boolean_t force_sync)
 {
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#if defined(HAVE_BIO_BDEV_DISK)
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-#else
-	struct request_queue *q = bio->bi_disk->queue;
-#endif
-#endif
-	zvol_state_t *zv = q->queuedata;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
-	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
-	uint64_t size = BIO_BI_SIZE(bio);
-	int rw = bio_data_dir(bio);
+	uint64_t offset = io_offset(bio, rq);
+	uint64_t size = io_size(bio, rq);
+	int rw = io_data_dir(bio, rq);
+
+	if (zvol_request_sync) {
+		force_sync = 1;
+	}
+
+	zv_request_t zvr = {
+		.zv = zv,
+		.bio = bio,
+		.rq = rq,
+	};
 
-	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
-		printk(KERN_INFO
-		    "%s: bad access: offset=%llu, size=%lu\n",
+	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
+		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 		    zv->zv_zso->zvo_disk->disk_name,
 		    (long long unsigned)offset,
 		    (long unsigned)size);
 
-		BIO_END_IO(bio, -SET_ERROR(EIO));
+		END_IO(zv, bio, rq, -SET_ERROR(EIO));
 		goto out;
 	}
 
-	zv_request_t zvr = {
-		.zv = zv,
-		.bio = bio,
-	};
 	zv_request_task_t *task;
 
 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-			BIO_END_IO(bio, -SET_ERROR(EROFS));
+			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
 			goto out;
 		}
 
 		/*
 		 * Prevents the zvol from being suspended, or the ZIL being
 		 * concurrently opened. Will be released after the i/o
@@ -418,7 +564,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		 * i/o may be a ZIL write (via zil_commit()), or a read of an
 		 * indirect block, or a read of a data block (if this is a
 		 * partial-block write). We will indicate that the i/o is
-		 * complete by calling BIO_END_IO() from the taskq callback.
+		 * complete by calling END_IO() from the taskq callback.
 		 *
 		 * This design allows the calling thread to continue and
 		 * initiate more concurrent operations by calling
@@ -438,12 +584,12 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		 * of one i/o at a time per zvol. However, an even better
 		 * design would be for zvol_request() to initiate the zio
 		 * directly, and then be notified by the zio_done callback,
-		 * which would call BIO_END_IO(). 
Unfortunately, the DMU/ZIL + * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync) { + if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); @@ -451,7 +597,7 @@ zvol_request(struct request_queue *q, struct bio *bio) zvol_discard_task, task, 0, &task->ent); } } else { - if (zvol_request_sync) { + if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); @@ -466,14 +612,14 @@ zvol_request(struct request_queue *q, struct bio *bio) * data and require no additional handling. */ if (size == 0) { - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ - if (zvol_request_sync) { + if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); @@ -484,8 +630,33 @@ zvol_request(struct request_queue *q, struct bio *bio) out: spl_fstrans_unmark(cookie); -#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ - defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \ +} + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID +static void +zvol_submit_bio(struct bio *bio) +#else +static blk_qc_t +zvol_submit_bio(struct bio *bio) +#endif +#else +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +#endif +{ +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#if defined(HAVE_BIO_BDEV_DISK) + struct request_queue *q = bio->bi_bdev->bd_disk->queue; +#else + struct request_queue *q = bio->bi_disk->queue; +#endif +#endif + zvol_state_t *zv = q->queuedata; + + zvol_request_impl(zv, bio, NULL, 0); +#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ + defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif @@ -802,6 +973,27 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +/* + * Why have two separate block_device_operations structs? + * + * Normally we'd just have one, and assign 'submit_bio' as needed. However, + * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we + * can't just change submit_bio dynamically at runtime. So just create two + * separate structs to get around this. 
+ */ +static const struct block_device_operations zvol_ops_blk_mq = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .check_events = zvol_check_events, +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + .revalidate_disk = zvol_revalidate_disk, +#endif + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; + static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, @@ -818,6 +1010,87 @@ static const struct block_device_operations zvol_ops = { #endif }; +static int +zvol_alloc_non_blk_mq(struct zvol_state_os *zso) +{ +#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); + if (zso->zvo_disk == NULL) + return (1); + + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#else + zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_BLK_ALLOC_DISK */ +#else + zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + return (0); + +} + +static int +zvol_alloc_blk_mq(zvol_state_t *zv) +{ +#ifdef HAVE_BLK_MQ + struct zvol_state_os *zso = zv->zv_zso; + + /* Allocate our blk-mq tag_set */ + if (zvol_blk_mq_alloc_tag_set(zv) != 0) + return (1); + +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); + if (zso->zvo_disk == NULL) + return (1); + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; +#else + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + /* Allocate queue */ + zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); + if (IS_ERR(zso->zvo_queue)) { + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + /* Our queue is now created, assign it to our disk */ + zso->zvo_disk->queue = zso->zvo_queue; + +#endif + + /* Finish blk-mq init */ + blk_queue_logical_block_size(zso->zvo_queue, 512); +#endif + return (0); +} + /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. 
@@ -828,6 +1101,7 @@ zvol_alloc(dev_t dev, const char *name) zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; + int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); @@ -846,48 +1120,46 @@ zvol_alloc(dev_t dev, const char *name) list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#ifdef HAVE_BLK_ALLOC_DISK - zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); - if (zso->zvo_disk == NULL) - goto out_kmem; - - zso->zvo_disk->minors = ZVOL_MINORS; - zso->zvo_queue = zso->zvo_disk->queue; +#ifdef HAVE_BLK_MQ + zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #else - zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); - if (zso->zvo_queue == NULL) - goto out_kmem; + zv->zv_zso->use_blk_mq = 0; +#endif - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); - goto out_kmem; + /* + * The block layer has 3 interfaces for getting BIOs: + * + * 1. blk-mq request queues (new) + * 2. submit_bio() (oldest) + * 3. regular request queues (old). + * + * Each of those interfaces has two permutations: + * + * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates + * both the disk and its queue (5.14 kernel or newer) + * + * b) We don't have blk_*alloc_disk(), and have to allocate the + * disk and the queue separately. (5.13 kernel or older) + */ + if (zv->zv_zso->use_blk_mq) { + ret = zvol_alloc_blk_mq(zv); + zso->zvo_disk->fops = &zvol_ops_blk_mq; + } else { + ret = zvol_alloc_non_blk_mq(zso); + zso->zvo_disk->fops = &zvol_ops; } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_BLK_ALLOC_DISK */ -#else - zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); - if (zso->zvo_queue == NULL) - goto out_kmem; - - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); + if (ret != 0) goto out_kmem; - } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); - /* Disable write merging in favor of the ZIO pipeline. */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + if (!zv->zv_zso->use_blk_mq) { + /* Disable write merging in favor of the ZIO pipeline. 
 */
+		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+	}
 
 	/* Enable /proc/diskstats */
 	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
@@ -920,7 +1192,6 @@ zvol_alloc(dev_t dev, const char *name)
 #endif
 	}
 	zso->zvo_disk->first_minor = (dev & MINORMASK);
-	zso->zvo_disk->fops = &zvol_ops;
 	zso->zvo_disk->private_data = zv;
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
@@ -953,6 +1224,11 @@ zvol_os_free(zvol_state_t *zv)
 	ASSERT0(zv->zv_open_count);
 	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
 
+#ifdef HAVE_BLK_MQ
+	if (zv->zv_zso->use_blk_mq)
+		flush_scheduled_work();
+#endif
+
 	rw_destroy(&zv->zv_suspend_lock);
 	zfs_rangelock_fini(&zv->zv_rangelock);
 
@@ -965,6 +1241,11 @@ zvol_os_free(zvol_state_t *zv)
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 
+#ifdef HAVE_BLK_MQ
+	if (zv->zv_zso->use_blk_mq)
+		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+#endif
+
 	ida_simple_remove(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
@@ -1046,8 +1327,69 @@ zvol_os_create_minor(const char *name)
 
 	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
 	    (DMU_MAX_ACCESS / 4) >> 9);
-	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
-	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+
+	if (zv->zv_zso->use_blk_mq) {
+		/*
+		 * IO requests can be really big (1MB). When an IO request
+		 * comes in, it is passed off to zvol_read() or zvol_write()
+		 * in a new thread, where it is chunked up into 'volblocksize'
+		 * sized pieces and processed. So for example, if the request
+		 * is a 1MB write and your volblocksize is 128k, one zvol_write
+		 * thread will take that request and sequentially do eight 128k
+		 * IOs. This is due to the fact that the thread needs to lock
+		 * each volblocksize sized block. So you might be wondering:
+		 * "instead of passing the whole 1MB request to one thread,
+		 * why not pass ten individual 128k chunks to ten threads and
+		 * process the whole write in parallel?" The short answer is
+		 * that there's a sweet spot number of chunks that balances
+		 * the greater parallelism with the added overhead of more
+		 * threads. The sweet spot can be different depending on if you
+		 * have a read or write heavy workload. Writes typically want
+		 * high chunk counts while reads typically want lower ones. On
+		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
+		 * configuration, with volblocksize=8k, the sweet spot for good
+		 * sequential reads and writes was at 8 chunks.
+		 */
+
+		/*
+		 * Below we tell the kernel how big we want our requests
+		 * to be. You would think that blk_queue_io_opt() would be
+		 * used to do this since it is used to "set optimal request
+		 * size for the queue", but that doesn't seem to do
+		 * anything - the kernel still gives you huge requests
+		 * with tons of little PAGE_SIZE segments contained within it.
+		 *
+		 * Knowing that the kernel will just give you PAGE_SIZE segments
+		 * no matter what, you can say "ok, I want PAGE_SIZE byte
+		 * segments, and I want 'N' of them per request", where N is
+		 * the correct number of segments for the volblocksize and
+		 * number of chunks you want.
+		 */
+#ifdef HAVE_BLK_MQ
+		if (zvol_blk_mq_blocks_per_thread != 0) {
+			unsigned int chunks;
+			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+			    PAGE_SIZE);
+			blk_queue_max_segments(zv->zv_zso->zvo_queue,
+			    (zv->zv_volblocksize * chunks) / PAGE_SIZE);
+		} else {
+			/*
+			 * Special case: zvol_blk_mq_blocks_per_thread = 0
+			 * Max everything out.
+			 */
+			blk_queue_max_segments(zv->zv_zso->zvo_queue,
+			    UINT16_MAX);
+			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+			    UINT_MAX);
+		}
+#endif
+	} else {
+		blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+		blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+	}
+
 	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
 	    zv->zv_volblocksize);
 	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
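[Editorial aside: the comment above describes one zvol thread walking a big
request in volblocksize pieces. A standalone sketch of that walk: a 1 MiB
request at volblocksize=128 KiB is eight sequential chunks, the figure the
comment's example works out.]

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long long off = 0, resid = 1048576;   /* 1 MiB request */
        unsigned long long volblocksize = 131072;      /* 128 KiB */
        int chunks = 0;

        while (resid > 0) {
            unsigned long long bytes = resid < volblocksize ?
                resid : volblocksize;
            /* one locked, volblocksize-sized dmu read/write goes here */
            off += bytes;
            resid -= bytes;
            chunks++;
        }
        printf("%d chunks\n", chunks);    /* prints: 8 chunks */
        return (0);
    }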
+ */ + blk_queue_max_segments(zv->zv_zso->zvo_queue, + UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, + UINT_MAX); + } +#endif + } else { + blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + } + blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); @@ -1167,19 +1509,36 @@ int zvol_init(void) { int error; - int threads = MIN(MAX(zvol_threads, 1), 1024); + + if (zvol_threads == 0) { + zvol_actual_threads = num_online_cpus(); + } else { + zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } - zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, - threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_queue_depth == 0) { + zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; + } else { + zvol_actual_blk_mq_queue_depth = + MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); + } +#endif + /* We're not using blk-mq so setup taskqueues */ + zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, + zvol_actual_threads, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } + zvol_init_impl(); ida_init(&zvol_ida); return (0); @@ -1202,7 +1561,8 @@ module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); +MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
@@ -1202,7 +1561,8 @@
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. "
+	"Set to 0 to use all active CPUs");
 
 module_param(zvol_request_sync, uint, 0644);
 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
@@ -1215,4 +1575,17 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 
 module_param(zvol_volmode, uint, 0644);
 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+	"Process volblocksize blocks per thread");
+#endif
+
 /* END CSTYLED */
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index a7ddb146e59b..2b846a6e66fb 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -941,6 +941,10 @@ tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
     'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil']
 tags = ['functional', 'zvol', 'zvol_misc']
 
+[tests/functional/zvol/zvol_stress]
+tests = ['zvol_stress']
+tags = ['functional', 'zvol', 'zvol_stress']
+
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index b229a161518b..ddf9d349c5ff 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3334,18 +3334,22 @@ function is_te_enabled
 	fi
 }
 
-# Utility function to determine if a system has multiple cpus.
-function is_mp
+# Return the number of CPUs (cross-platform)
+function get_num_cpus
 {
-	if is_linux; then
-		(($(nproc) > 1))
+	if is_linux ; then
+		nproc
 	elif is_freebsd; then
 		sysctl -n kern.smp.cpus
 	else
-		(($(psrinfo | wc -l) > 1))
+		psrinfo | wc -l
 	fi
+}
 
-	return $?
+# Utility function to determine if a system has multiple cpus.
+function is_mp
+{
+	[[ $(get_num_cpus) -gt 1 ]]
 }
 
 function get_cpu_freq
@@ -3888,14 +3892,23 @@ function get_tunable_impl
 {
 	typeset name="$1"
 	typeset module="${2:-zfs}"
	typeset check_only="$3"
 
 	eval "typeset tunable=\$$name"
 	case "$tunable" in
 	UNSUPPORTED)
-		log_unsupported "Tunable '$name' is unsupported on $(uname)"
+		if [ -z "$check_only" ] ; then
+			log_unsupported "Tunable '$name' is unsupported on $(uname)"
+		else
+			return 1
+		fi
 		;;
 	"")
-		log_fail "Tunable '$name' must be added to tunables.cfg"
+		if [ -z "$check_only" ] ; then
+			log_fail "Tunable '$name' must be added to tunables.cfg"
+		else
+			return 1
+		fi
 		;;
 	*)
 		;;
@@ -3919,6 +3932,14 @@ function get_tunable_impl
 	return 1
 }
 
+# Does a tunable exist?
+#
+# $1: Tunable name
+function tunable_exists
+{
+	get_tunable_impl $1 "zfs" 1
+}
+
 #
 # Prints the current time in seconds since UNIX Epoch.
# diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index eea2af2edcf0..fcacf519ce44 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -87,6 +87,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/functional/zvol/Makefile.am b/tests/zfs-tests/tests/functional/zvol/Makefile.am index e4910754bb81..9089a939abb0 100644 --- a/tests/zfs-tests/tests/functional/zvol/Makefile.am +++ b/tests/zfs-tests/tests/functional/zvol/Makefile.am @@ -5,6 +5,7 @@ dist_pkgdata_DATA = \ SUBDIRS = \ zvol_ENOSPC \ + zvol_stress \ zvol_cli \ zvol_misc \ zvol_swap diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am new file mode 100644 index 000000000000..5ccd0c7b5619 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/zvol/zvol_stress +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + zvol_stress.ksh diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh new file mode 100755 index 000000000000..b81a372638e3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh new file mode 100755 index 000000000000..746ac307a755 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh @@ -0,0 +1,38 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_setup "$DISKS" + +log_pass diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh new file mode 100755 index 000000000000..94d3717c42af --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -0,0 +1,171 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/reservation/reservation.shlib + +# +# DESCRIPTION: +# Stress test multithreaded transfers to multiple zvols. Also verify +# zvol errors show up in zpool status. +# +# STRATEGY: +# +# For both the normal submit_bio() codepath and the blk-mq codepath, do +# the following: +# +# 1. Create one zvol per CPU +# 2. In parallel, spawn an fio "write and verify" for each zvol +# 3. Inject write errors +# 4. Write to one of the zvols with dd and verify the errors +# + +verify_runnable "global" + +num_zvols=$(get_num_cpus) + +# If we were making one big zvol from all the pool space, it would +# be this big: +biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL) + +# Crude calculation: take the biggest zvol size we could possibly +# create, knock 10% off it (for overhead) and divide by the number +# of ZVOLs we want to make. 
+each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9) / $num_zvols ))
+
+typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)"
+
+function create_zvols
+{
+	log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each"
+	for i in $(seq $num_zvols) ; do
+		log_must zfs create -V $each_zvol_size $TESTPOOL/testvol$i
+		block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/testvol$i"
+	done
+}
+
+function destroy_zvols
+{
+	for i in $(seq $num_zvols) ; do
+		log_must_busy zfs destroy $TESTPOOL/testvol$i
+	done
+}
+
+# enable/disable blk-mq (if available)
+#
+# $1: 1 = enable, 0 = disable
+function set_blk_mq
+{
+	# Not all kernels support blk-mq
+	if tunable_exists VOL_USE_BLK_MQ ; then
+		log_must set_tunable32 VOL_USE_BLK_MQ $1
+	fi
+}
+
+function do_zvol_stress
+{
+	# Write 10% of each zvol, or 50MB, whichever is less
+	zvol_write_size=$((each_zvol_size / 10))
+	if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then
+		zvol_write_size=$((50 * 1048576))
+	fi
+	zvol_write_size_mb=$(($zvol_write_size / 1048576))
+
+	if is_linux ; then
+		engine=libaio
+	else
+		engine=psync
+	fi
+
+	# Spawn off one fio per zvol in parallel
+	pids=""
+	for i in $(seq $num_zvols) ; do
+		# Spawn one fio per zvol as its own process
+		fio --ioengine=$engine --name=zvol_stress$i --direct=0 \
+		    --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" --bs=1048576 \
+		    --iodepth=10 --readwrite=randwrite --size=${zvol_write_size} \
+		    --verify_async=2 --numjobs=1 --verify=sha1 \
+		    --verify_fatal=1 \
+		    --continue_on_error=none \
+		    --error_dump=1 \
+		    --exitall_on_error \
+		    --aux-path="$tmpdir" --do_verify=1 &
+		pids="$pids $!"
+	done
+
+	# Wait for all the spawned fios to finish and look for errors
+	fail=""
+	i=0
+	for pid in $pids ; do
+		log_note "waiting on fio pid $pid"
+		if ! wait $pid ; then
+			log_fail "fio error on $TESTPOOL/testvol$i"
+		fi
+		i=$(($i + 1))
+	done
+}
+
+function cleanup
+{
+	log_must zinject -c all
+	log_must zpool clear $TESTPOOL
+	destroy_zvols
+	set_blk_mq 0
+
+	# Remove all fio's leftover state files
+	if [ -n "$tmpdir" ] ; then
+		rm -f "$tmpdir"/*.state
+		rmdir "$tmpdir"
+	fi
+}
+
+log_onexit cleanup
+
+log_assert "Stress test zvols"
+
+set_blk_mq 0
+create_zvols
+# Do some fio write/verifies in parallel
+do_zvol_stress
+destroy_zvols
+
+# Enable blk-mq (block multi-queue), and re-run the same test
+set_blk_mq 1
+create_zvols
+do_zvol_stress
+
+# Inject some errors, and verify we see some IO errors in zpool status
+for DISK in $DISKS ; do
+	log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL
+done
+log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50
+log_must zinject -c all
+
+log_must zpool status
+write_errors=$(zpool status -pv | grep $DISK | awk '{print $4}')
+if [ $write_errors -le 0 ] ; then
+	log_fail "Expected to see some write errors (saw $write_errors)"
+else
+	log_note "Correctly saw $write_errors write errors"
+fi
+log_pass "Done with zvol_stress"
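[Editorial aside: the test above sizes each fio job as "10% of the zvol,
capped at 50 MiB". The same arithmetic as a standalone sketch, with a
hypothetical 2 GiB zvol as the input:]

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long long zvol_size = 2147483648ULL;  /* e.g. a 2 GiB zvol */
        unsigned long long cap = 50ULL * 1048576;      /* 50 MiB */

        unsigned long long write_size = zvol_size / 10;
        if (write_size > cap)
            write_size = cap;

        printf("fio --size=%llu (%llu MiB)\n", write_size,
            write_size / 1048576);
        /* prints: fio --size=52428800 (50 MiB) */
        return (0);
    }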