diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 559ae9800e8f..907a2c48ce67 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -315,6 +315,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ ]) ]) +dnl # +dnl # See if kernel supports block multi-queue and blk_status_t. +dnl # blk_status_t represents the new status codes introduced in the 4.13 +dnl # kernel patch: +dnl # +dnl # block: introduce new block status code type +dnl # +dnl # We do not currently support the "old" block multi-queue interfaces from +dnl # prior kernels. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ + ZFS_LINUX_TEST_SRC([blk_mq], [ + #include + ], [ + struct blk_mq_tag_set tag_set = {0}; + (void) blk_mq_alloc_tag_set(&tag_set); + return BLK_STS_OK; + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ + AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available]) + ZFS_LINUX_TEST_RESULT([blk_mq], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI @@ -326,6 +356,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS + ZFS_AC_KERNEL_SRC_BLK_MQ ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ @@ -339,4 +370,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ ZFS_AC_KERNEL_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS + ZFS_AC_KERNEL_BLK_MQ ]) diff --git a/configure.ac b/configure.ac index 7037c06b225f..dc711fe7b6fe 100644 --- a/configure.ac +++ b/configure.ac @@ -403,6 +403,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/zpool_influxdb/Makefile tests/zfs-tests/tests/functional/zvol/Makefile tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 9fa8884bb7a1..6d9c6c0f1a48 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -34,6 +34,7 @@ #include #include #include /* for SECTOR_* */ +#include #ifndef HAVE_BLK_QUEUE_FLAG_SET static inline void @@ -579,4 +580,90 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ +static inline int +io_data_dir(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) { + enum req_opf op = req_op(rq); + if (op_is_write(op)) { + return (WRITE); + } else { + return (READ); + } + } +#endif + return (bio_data_dir(bio)); +} + +static inline int +io_is_flush(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_FLUSH); +#endif + return (bio_is_flush(bio)); +} + +static inline int +io_is_discard(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_DISCARD); +#endif + return (bio_is_discard(bio)); +} + +static inline int +io_is_secure_erase(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (req_op(rq) == REQ_OP_SECURE_ERASE); +#endif + return 
(bio_is_secure_erase(bio)); +} + +static inline int +io_is_fua(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (rq->cmd_flags & REQ_FUA); +#endif + return (bio_is_fua(bio)); +} + + +static inline uint64_t +io_offset(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (blk_rq_pos(rq) << 9); +#endif + return (BIO_BI_SECTOR(bio) << 9); +} + +static inline uint64_t +io_size(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (blk_rq_bytes(rq)); +#endif + return (BIO_BI_SIZE(bio)); +} + +static inline int +io_has_data(struct bio *bio, struct request *rq) +{ +#ifdef HAVE_BLK_MQ + if (rq != NULL) + return (bio_has_data(rq->bio)); +#endif + return (bio_has_data(bio)); +} #endif /* _ZFS_BLKDEV_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 439eec986236..ed13a4398b25 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -68,10 +68,20 @@ typedef struct zfs_uio { boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; - ssize_t uio_resid; + ssize_t uio_resid; size_t uio_skip; + + struct request *rq; + + /* + * Used for saving rq_for_each_segment() state between calls + * to zfs_uiomove_bvec_rq(). + */ + struct req_iterator iter; + struct bio_vec bv; } zfs_uio_t; + #define zfs_uio_segflg(u) (u)->uio_segflg #define zfs_uio_offset(u) (u)->uio_loffset #define zfs_uio_resid(u) (u)->uio_resid @@ -116,17 +126,30 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, } static inline void -zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio) +zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) { - uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); - uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; + if (bio) { + uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); + uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; + } else { + uio->uio_bvec = NULL; + uio->uio_iovcnt = 0; + memset(&uio->iter, 0, sizeof (uio->iter)); + } + + uio->uio_loffset = io_offset(bio, rq); uio->uio_segflg = UIO_BVEC; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; - uio->uio_resid = BIO_BI_SIZE(bio); - uio->uio_skip = BIO_BI_SKIP(bio); + uio->uio_resid = io_size(bio, rq); + if (bio) { + uio->uio_skip = BIO_BI_SKIP(bio); + } else { + uio->uio_skip = 0; + } + + uio->rq = rq; } #if defined(HAVE_VFS_IOV_ITER) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index bb3cd2243ad3..eed90fbd2f51 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2207,9 +2207,67 @@ for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads . -. -.It Sy zvol_threads Ns = Ns Sy 32 Pq uint -Max number of threads which can handle zvol I/O requests concurrently. +.Sy zvol_request_sync +is ignored when running on a kernel that supports block multiqueue +.Pq Li blk-mq . +. +.It Sy zvol_threads Ns = Ns Sy 0 Pq uint +The number of threads to use for processing zvol block IOs. +On older +.No non- Ns Li blk-mq +kernels, +.Sy zvol_threads +is the total number of threads to use for all zvols. +On kernels that support +.Li blk-mq +.Sy zvol_threads +is the total number of threads per zvol. +If +.Sy 0 +(the default) then internally set +.Sy zvol_threads +to the number of CPUs present. 
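A usage sketch (not part of the patch; the values shown are illustrative only): these module parameters are normally set when the zfs module is loaded, for example from a modprobe.d options file, and as described above they are only read and applied to a zvol at zvol load time.

    # /etc/modprobe.d/zfs.conf -- example values only
    options zfs zvol_use_blk_mq=1
    options zfs zvol_threads=0                   # 0 (default) = number of CPUs
    options zfs zvol_blk_mq_blocks_per_thread=8

    # Parameters registered with 0644 permissions (e.g. zvol_use_blk_mq) can
    # also be changed at runtime through sysfs; the new value only affects
    # zvols loaded after the change.
    echo 1 > /sys/module/zfs/parameters/zvol_use_blk_mq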
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Set to +.Sy 1 +to use the +.Li blk-mq +API for zvols. +Set to +.Sy 0 +(the default) to use the legacy zvol APIs. +This setting can give better or worse zvol performance depending on +the workload. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint +If +.Sy zvol_use_blk_mq +is enabled, then process this number of volblocksize blocks per zvol thread. +This tunable can be use to favor better performance for zvol reads (lower +values) or writes (higher values). +If set to 0, then the zvol layer will process the maximum number of blocks +per thread that it can. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint +The queue_depth value for the zvol +.Li blk-mq +interface. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read at zvol load time. +If +.Sy 0 +(the default) then use the kernel's default queue depth. +If you set +.Sy zvol_blk_mq_queue_depth +lower than the kernel's minimum queue depth, it will be internally +capped to the kernel's minimum queue depth (currently 4 on 5.15 kernels). . .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index a3d5d5f83b6f..ece9cfe8dfdb 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -47,6 +47,7 @@ #include #include #include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -126,7 +127,7 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) } static int -zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct bio_vec *bv = uio->uio_bvec; size_t skip = uio->uio_skip; @@ -137,10 +138,13 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) cnt = MIN(bv->bv_len - skip, n); paddr = zfs_kmap_atomic(bv->bv_page); - if (rw == UIO_READ) + if (rw == UIO_READ) { + /* Copy from buffer 'p' to the bvec data */ bcopy(p, paddr + bv->bv_offset + skip, cnt); - else + } else { + /* Copy from bvec data to buffer 'p' */ bcopy(paddr + bv->bv_offset + skip, p, cnt); + } zfs_kunmap_atomic(paddr); skip += cnt; @@ -158,6 +162,139 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) return (0); } +#ifdef HAVE_BLK_MQ +static void +zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw, + struct bio_vec *bv) +{ + void *paddr; + + paddr = zfs_kmap_atomic(bv->bv_page); + if (rw == UIO_READ) { + /* Copy from buffer 'p' to the bvec data */ + bcopy(p, paddr + bv->bv_offset + skip, cnt); + } else { + /* Copy from bvec data to buffer 'p' */ + bcopy(paddr + bv->bv_offset + skip, p, cnt); + } + zfs_kunmap_atomic(paddr); +} + +/* + * Copy 'n' bytes of data between the buffer p[] and the data represented + * by the request in the uio. 
+ */ +static int +zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ + struct request *rq = uio->rq; + struct bio_vec bv; + struct req_iterator iter; + size_t this_seg_start; /* logical offset */ + size_t this_seg_end; /* logical offset */ + size_t skip_in_seg; + size_t copy_from_seg; + size_t orig_loffset; + int copied = 0; + + /* + * Get the original logical offset of this entire request (because + * uio->uio_loffset will be modified over time). + */ + orig_loffset = io_offset(NULL, rq); + this_seg_start = orig_loffset; + + rq_for_each_segment(bv, rq, iter) { + if (uio->iter.bio) { + /* + * If uio->iter.bio is present, then we know we've saved + * uio->iter from a previous call to this function, and + * we can skip ahead in this rq_for_each_segment() loop + * to where we last left off. That way, we don't need + * to iterate over tons of segments we've already + * processed - we can just restore the "saved state". + */ + iter = uio->iter; + bv = uio->bv; + this_seg_start = uio->uio_loffset; + memset(&uio->iter, 0, sizeof (uio->iter)); + continue; + } + + /* + * Lookup what the logical offset of the last byte of this + * segment is. + */ + this_seg_end = this_seg_start + bv.bv_len - 1; + + /* + * We only need to operate on segments that have data we're + * copying. + */ + if (uio->uio_loffset >= this_seg_start && + uio->uio_loffset <= this_seg_end) { + /* + * Some, or all, of the data in this segment needs to be + * copied. + */ + + /* + * We may be not be copying from the first byte in the + * segment. Figure out how many bytes to skip copying + * from the beginning of this segment. + */ + skip_in_seg = uio->uio_loffset - this_seg_start; + + /* + * Calculate the total number of bytes from this + * segment that we will be copying. + */ + copy_from_seg = MIN(bv.bv_len - skip_in_seg, n); + + /* Copy the bytes */ + zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv); + p = ((char *)p) + copy_from_seg; + + n -= copy_from_seg; + uio->uio_resid -= copy_from_seg; + uio->uio_loffset += copy_from_seg; + copied = 1; /* We copied some data */ + } + + if (n == 0) { + /* + * All done copying. Save our 'iter' value to the uio. + * This allows us to "save our state" and skip ahead in + * the rq_for_each_segment() loop the next time we call + * call zfs_uiomove_bvec_rq() on this uio (which we + * will be doing for any remaining data in the uio). + */ + uio->iter = iter; /* make a copy of the struct data */ + uio->bv = bv; + return (0); + } + + this_seg_start = this_seg_end + 1; + } + + if (!copied) { + /* Didn't copy anything */ + uio->uio_resid = 0; + } + return (0); +} +#endif + +static inline int +zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ +#ifdef HAVE_BLK_MQ + if (uio->rq != NULL) + return (zfs_uiomove_bvec_rq(p, n, rw, uio)); +#endif + return (zfs_uiomove_bvec_impl(p, n, rw, uio)); +} + #if defined(HAVE_VFS_IOV_ITER) static int zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, @@ -300,8 +437,14 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) { if (n > uio->uio_resid) return; - - if (uio->uio_segflg == UIO_BVEC) { + /* + * When using a uio with a struct request, we simply + * use uio_loffset as a pointer to the next logical byte to + * copy in the request. We don't have to do any fancy + * accounting with uio_bvec/uio_iovcnt since we don't use + * them. 
+ */ + if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) { uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_bvec->bv_len) { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index f772f416043e..a9606caef48b 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -41,17 +41,80 @@ #include #include -static unsigned int zvol_major = ZVOL_MAJOR; -static unsigned int zvol_request_sync = 0; -static unsigned int zvol_prefetch_bytes = (128 * 1024); -static unsigned long zvol_max_discard_blocks = 16384; -static unsigned int zvol_threads = 32; -static const unsigned int zvol_open_timeout_ms = 1000; +#ifdef HAVE_BLK_MQ +#include +#endif + +static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, + struct request *rq, boolean_t force_sync); + +unsigned int zvol_major = ZVOL_MAJOR; +unsigned int zvol_request_sync = 0; +unsigned int zvol_prefetch_bytes = (128 * 1024); +unsigned long zvol_max_discard_blocks = 16384; +unsigned int zvol_open_timeout_ms = 1000; + +/* + * zvol_threads is the module param the user passes in. + * + * zvol_actual_threads is what we use internally, since the user can pass + * zvol_thread = 0 to mean "use all the CPUs" (the default). So on a quad + * core system, you would have: zvol_threads = 0, zvol_actual_threads = 4. + */ +static unsigned int zvol_threads = 0; +static unsigned int zvol_actual_threads; +#ifdef HAVE_BLK_MQ +static boolean_t zvol_use_blk_mq = B_FALSE; + +/* + * The maximum number of volblocksize blocks to process per thread. Typically, + * write heavy workloads perform better with higher values here, and read + * heavy workloads preform better with lower values, but that's not a hard + * and fast rule. It's basically a knob to tune between "less overhead with + * less parallelism" and "more overhead, but more parallelism". + * + * '8' was chosen as a reasonable, balanced, default based off of sequential + * read and write tests to a zvol in an NVMe pool (with 16 CPUs). + */ +static unsigned int zvol_blk_mq_blocks_per_thread = 8; +#endif + +#ifndef BLKDEV_DEFAULT_RQ +/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ +#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ +#endif + +/* + * Finalize our BIO or request. + */ +#ifdef HAVE_BLK_MQ +#define END_IO(zv, bio, rq, error) do { \ + if (bio) { \ + BIO_END_IO(bio, error); \ + } else { \ + blk_mq_end_request(rq, errno_to_bi_status(error)); \ + } \ +} while (0) +#else +#define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) +#endif + +#ifdef HAVE_BLK_MQ +static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; +static unsigned int zvol_actual_blk_mq_queue_depth; +#endif struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ + +#ifdef HAVE_BLK_MQ + struct blk_mq_tag_set tag_set; +#endif + + /* Set from the global 'zvol_use_blk_mq' at zvol load */ + boolean_t use_blk_mq; }; taskq_t *zvol_taskq; @@ -60,8 +123,14 @@ static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; + struct request *rq; } zv_request_t; +typedef struct zv_work { + struct request *rq; + struct work_struct work; +} zv_work_t; + typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; @@ -83,6 +152,62 @@ zv_request_task_free(zv_request_task_t *task) kmem_free(task, sizeof (*task)); } +#ifdef HAVE_BLK_MQ + +/* + * This is called when a new block multiqueue request comes in. 
A request + * contains one or more BIOs. + */ +static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + zvol_state_t *zv = rq->q->queuedata; + + /* Tell the kernel that we are starting to process this request */ + blk_mq_start_request(rq); + + if (blk_rq_is_passthrough(rq)) { + /* Skip non filesystem request */ + blk_mq_end_request(rq, BLK_STS_IOERR); + return (BLK_STS_IOERR); + } + + zvol_request_impl(zv, NULL, rq, 0); + + /* Acknowledge to the kernel that we got this request */ + return (BLK_STS_OK); +} + +static struct blk_mq_ops zvol_blk_mq_queue_ops = { + .queue_rq = zvol_mq_queue_rq, +}; + +/* Initialize our blk-mq struct */ +static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) +{ + struct zvol_state_os *zso = zv->zv_zso; + + memset(&zso->tag_set, 0, sizeof (zso->tag_set)); + + /* Initialize tag set. */ + zso->tag_set.ops = &zvol_blk_mq_queue_ops; + zso->tag_set.nr_hw_queues = zvol_actual_threads; + zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; + zso->tag_set.numa_node = NUMA_NO_NODE; + zso->tag_set.cmd_size = 0; + + /* + * We need BLK_MQ_F_BLOCKING here since we do blocking calls in + * zvol_request_impl() + */ + zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + zso->tag_set.driver_data = zv; + + return (blk_mq_alloc_tag_set(&zso->tag_set)); +} +#endif /* HAVE_BLK_MQ */ + /* * Given a path, return TRUE if path is a ZVOL. */ @@ -104,38 +229,47 @@ static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; - - zfs_uio_bvec_init(&uio, bio); - zvol_state_t *zv = zvr->zv; + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time = 0; + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ - if (bio_is_flush(bio)) + if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ - if (uio.uio_resid == 0) { + if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); return; } - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; + zfs_uio_bvec_init(&uio, bio, rq); + ssize_t start_resid = uio.uio_resid; - unsigned long start_time; - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + /* + * With use_blk_mq, accounting is done by blk_mq_start_request() + * and blk_mq_end_request(), so we can skip it here. 
+ */ + if (!zv->zv_zso->use_blk_mq) { + boolean_t acct = blk_queue_io_stat(q); + if (acct) { + start_time = blk_generic_start_io_acct(q, disk, WRITE, + bio); + } + } boolean_t sync = - bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); @@ -177,10 +311,11 @@ zvol_write(zv_request_t *zvr) rw_exit(&zv->zv_suspend_lock); - if (acct) + if (!zv->zv_zso->use_blk_mq) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -195,27 +330,32 @@ static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; - uint64_t start = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); + uint64_t start = io_offset(bio, rq); + uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time = 0; + + boolean_t acct = blk_queue_io_stat(q); ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; - unsigned long start_time; - - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + if (bio) { + if (acct) { + start_time = blk_generic_start_io_acct(q, disk, WRITE, + bio); + } + } - sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); @@ -228,7 +368,7 @@ zvol_discard(zv_request_t *zvr) * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ - if (!bio_is_secure_erase(bio)) { + if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; @@ -259,10 +399,14 @@ zvol_discard(zv_request_t *zvr) unlock: rw_exit(&zv->zv_suspend_lock); - if (acct) - blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + if (bio) { + if (acct) { + blk_generic_end_io_acct(q, disk, WRITE, bio, + start_time); + } + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -277,28 +421,38 @@ static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; + struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; + boolean_t acct = 0; + zvol_state_t *zv = zvr->zv; + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time = 0; - zfs_uio_bvec_init(&uio, bio); + zfs_uio_bvec_init(&uio, bio, rq); - zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); - struct request_queue *q = zv->zv_zso->zvo_queue; - struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; - unsigned long start_time; - boolean_t acct = blk_queue_io_stat(q); - if (acct) - start_time = blk_generic_start_io_acct(q, disk, READ, bio); + /* + * When blk-mq is being used, accounting is done by + * blk_mq_start_request() and blk_mq_end_request(). 
+ */ + if (!zv->zv_zso->use_blk_mq) { + acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, READ, + bio); + } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); @@ -322,10 +476,12 @@ zvol_read(zv_request_t *zvr) rw_exit(&zv->zv_suspend_lock); - if (acct) - blk_generic_end_io_acct(q, disk, READ, bio, start_time); + if (!zv->zv_zso->use_blk_mq) { + if (acct) + blk_generic_end_io_acct(q, disk, READ, bio, start_time); + } - BIO_END_IO(bio, -error); + END_IO(zv, bio, rq, -error); } static void @@ -336,55 +492,45 @@ zvol_read_task(void *arg) zv_request_task_free(task); } -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID + +/* + * Process a BIO + * + * force_sync: Set to 0 to defer processing the BIO to a background taskq + * Set to 1 to process the BIO right now. + */ static void -zvol_submit_bio(struct bio *bio) -#else -static blk_qc_t -zvol_submit_bio(struct bio *bio) -#endif -#else -static MAKE_REQUEST_FN_RET -zvol_request(struct request_queue *q, struct bio *bio) -#endif +zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, + boolean_t force_sync) { -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#if defined(HAVE_BIO_BDEV_DISK) - struct request_queue *q = bio->bi_bdev->bd_disk->queue; -#else - struct request_queue *q = bio->bi_disk->queue; -#endif -#endif - zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); - uint64_t offset = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); - int rw = bio_data_dir(bio); + uint64_t offset = io_offset(bio, rq); + uint64_t size = io_size(bio, rq); + int rw = io_data_dir(bio, rq); + + if (zvol_request_sync) { + force_sync = 1; + } + + zv_request_t zvr = { + .zv = zv, + .bio = bio, + .rq = rq, + }; - if (bio_has_data(bio) && offset + size > zv->zv_volsize) { - printk(KERN_INFO - "%s: bad access: offset=%llu, size=%lu\n", + if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { + printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); - BIO_END_IO(bio, -SET_ERROR(EIO)); + END_IO(zv, bio, rq, -SET_ERROR(EIO)); goto out; } - zv_request_t zvr = { - .zv = zv, - .bio = bio, - }; zv_request_task_t *task; if (rw == WRITE) { - if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - BIO_END_IO(bio, -SET_ERROR(EROFS)); - goto out; - } - /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o @@ -418,7 +564,7 @@ zvol_request(struct request_queue *q, struct bio *bio) * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is - * complete by calling BIO_END_IO() from the taskq callback. + * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling @@ -438,12 +584,12 @@ zvol_request(struct request_queue *q, struct bio *bio) * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, - * which would call BIO_END_IO(). 
Unfortunately, the DMU/ZIL + * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync) { + if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); @@ -451,7 +597,7 @@ zvol_request(struct request_queue *q, struct bio *bio) zvol_discard_task, task, 0, &task->ent); } } else { - if (zvol_request_sync) { + if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); @@ -466,14 +612,14 @@ zvol_request(struct request_queue *q, struct bio *bio) * data and require no additional handling. */ if (size == 0) { - BIO_END_IO(bio, 0); + END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ - if (zvol_request_sync) { + if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); @@ -484,8 +630,33 @@ zvol_request(struct request_queue *q, struct bio *bio) out: spl_fstrans_unmark(cookie); -#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ - defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \ +} + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID +static void +zvol_submit_bio(struct bio *bio) +#else +static blk_qc_t +zvol_submit_bio(struct bio *bio) +#endif +#else +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +#endif +{ +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#if defined(HAVE_BIO_BDEV_DISK) + struct request_queue *q = bio->bi_bdev->bd_disk->queue; +#else + struct request_queue *q = bio->bi_disk->queue; +#endif +#endif + zvol_state_t *zv = q->queuedata; + + zvol_request_impl(zv, bio, NULL, 0); +#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ + defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif @@ -802,6 +973,27 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } +/* + * Why have two separate block_device_operations structs? + * + * Normally we'd just have one, and assign 'submit_bio' as needed. However, + * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we + * can't just change submit_bio dynamically at runtime. So just create two + * separate structs to get around this. 
+ */ +static const struct block_device_operations zvol_ops_blk_mq = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .check_events = zvol_check_events, +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + .revalidate_disk = zvol_revalidate_disk, +#endif + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; + static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, @@ -818,6 +1010,87 @@ static const struct block_device_operations zvol_ops = { #endif }; +static int +zvol_alloc_non_blk_mq(struct zvol_state_os *zso) +{ +#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); + if (zso->zvo_disk == NULL) + return (1); + + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#else + zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_BLK_ALLOC_DISK */ +#else + zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + return (1); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + return (0); + +} + +static int +zvol_alloc_blk_mq(zvol_state_t *zv) +{ +#ifdef HAVE_BLK_MQ + struct zvol_state_os *zso = zv->zv_zso; + + /* Allocate our blk-mq tag_set */ + if (zvol_blk_mq_alloc_tag_set(zv) != 0) + return (1); + +#if defined(HAVE_BLK_ALLOC_DISK) + zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); + if (zso->zvo_disk == NULL) + return (1); + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; +#else + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + return (1); + } + /* Allocate queue */ + zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); + if (IS_ERR(zso->zvo_queue)) { + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + /* Our queue is now created, assign it to our disk */ + zso->zvo_disk->queue = zso->zvo_queue; + +#endif + + /* Finish blk-mq init */ + blk_queue_logical_block_size(zso->zvo_queue, 512); +#endif + return (0); +} + /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. 
@@ -828,6 +1101,7 @@ zvol_alloc(dev_t dev, const char *name) zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; + int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); @@ -846,48 +1120,46 @@ zvol_alloc(dev_t dev, const char *name) list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); -#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS -#ifdef HAVE_BLK_ALLOC_DISK - zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); - if (zso->zvo_disk == NULL) - goto out_kmem; - - zso->zvo_disk->minors = ZVOL_MINORS; - zso->zvo_queue = zso->zvo_disk->queue; +#ifdef HAVE_BLK_MQ + zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #else - zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); - if (zso->zvo_queue == NULL) - goto out_kmem; + zv->zv_zso->use_blk_mq = 0; +#endif - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); - goto out_kmem; + /* + * The block layer has 3 interfaces for getting BIOs: + * + * 1. blk-mq request queues (new) + * 2. submit_bio() (oldest) + * 3. regular request queues (old). + * + * Each of those interfaces has two permutations: + * + * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates + * both the disk and its queue (5.14 kernel or newer) + * + * b) We don't have blk_*alloc_disk(), and have to allocate the + * disk and the queue separately. (5.13 kernel or older) + */ + if (zv->zv_zso->use_blk_mq) { + ret = zvol_alloc_blk_mq(zv); + zso->zvo_disk->fops = &zvol_ops_blk_mq; + } else { + ret = zvol_alloc_non_blk_mq(zso); + zso->zvo_disk->fops = &zvol_ops; } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_BLK_ALLOC_DISK */ -#else - zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); - if (zso->zvo_queue == NULL) - goto out_kmem; - - zso->zvo_disk = alloc_disk(ZVOL_MINORS); - if (zso->zvo_disk == NULL) { - blk_cleanup_queue(zso->zvo_queue); + if (ret != 0) goto out_kmem; - } - - zso->zvo_disk->queue = zso->zvo_queue; -#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); - /* Disable write merging in favor of the ZIO pipeline. */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + if (!zv->zv_zso->use_blk_mq) { + /* Disable write merging in favor of the ZIO pipeline. 
*/ + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + } /* Enable /proc/diskstats */ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); @@ -920,7 +1192,6 @@ zvol_alloc(dev_t dev, const char *name) #endif } zso->zvo_disk->first_minor = (dev & MINORMASK); - zso->zvo_disk->fops = &zvol_ops; zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); @@ -953,6 +1224,11 @@ zvol_os_free(zvol_state_t *zv) ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); +#ifdef HAVE_BLK_MQ + if (zv->zv_zso->use_blk_mq) + flush_scheduled_work(); +#endif + rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); @@ -965,6 +1241,11 @@ zvol_os_free(zvol_state_t *zv) put_disk(zv->zv_zso->zvo_disk); #endif +#ifdef HAVE_BLK_MQ + if (zv->zv_zso->use_blk_mq) + blk_mq_free_tag_set(&zv->zv_zso->tag_set); +#endif + ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); @@ -1046,8 +1327,69 @@ zvol_os_create_minor(const char *name) blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); - blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + + if (zv->zv_zso->use_blk_mq) { + /* + * IO requests can be really big (1MB). When an IO request + * comes * in, it is passed off to zvol_read() or zvol_write() + * in a new thread, where it is chunked up into 'volblocksize' + * sized pieces and processed. So for example, if the request + * is a 1MB write and your volblocksize is 128k, one zvol_write + * thread will * take that request and sequentially do ten 128k + * IOs. This is due to the fact that the thread needs to lock + * each volblocksize sized block. So you might be wondering: + * "instead of passing the whole 1MB request to one thread, + * why not pass ten individual 128k chunks to ten threads and + * process the whole write in parallel?" The short answer is + * that there's a sweet spot number of chunks that balances + * the greater parallelism with the added overhead of more + * threads. The sweet spot can be different depending on if you + * have a read or write heavy workload. Writes typically want + * high chunk counts while reads typically want lower ones. On + * a test pool with 6 NVMe drives in a 3x 2-disk mirror + * configuration, with volblocksize=8k, the sweet spot for good + * sequential reads and writes was at 8 chunks. + */ + + /* + * Below we tell the kernel how big we want our requests + * to be. You would think that blk_queue_io_opt() would be + * used to do this since it is used to "set optimal request + * size for the queue", but that doesn't seem to do + * anything - the kernel still gives you huge requests + * with tons of little PAGE_SIZE segments contained within it. + * + * Knowing that the kernel will just give you PAGE_SIZE segments + * no matter what, you can say "ok, I want PAGE_SIZE byte + * segments, and I want 'N' of them per request", where N is + * the correct number of segments for the volblocksize and + * number of chunks you want. + */ +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_blocks_per_thread != 0) { + unsigned int chunks; + chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); + + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, + PAGE_SIZE); + blk_queue_max_segments(zv->zv_zso->zvo_queue, + (zv->zv_volblocksize * chunks) / PAGE_SIZE); + } else { + /* + * Special case: zvol_blk_mq_blocks_per_thread = 0 + * Max everything out. 
+ */ + blk_queue_max_segments(zv->zv_zso->zvo_queue, + UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, + UINT_MAX); + } +#endif + } else { + blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + } + blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); @@ -1167,19 +1509,36 @@ int zvol_init(void) { int error; - int threads = MIN(MAX(zvol_threads, 1), 1024); + + if (zvol_threads == 0) { + zvol_actual_threads = num_online_cpus(); + } else { + zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); + } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } - zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, - threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_queue_depth == 0) { + zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; + } else { + zvol_actual_blk_mq_queue_depth = + MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); + } +#endif + /* We're not using blk-mq so setup taskqueues */ + zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, + zvol_actual_threads, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } + zvol_init_impl(); ida_init(&zvol_ida); return (0); @@ -1202,7 +1561,8 @@ module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); +MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
Set" + "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); @@ -1215,4 +1575,17 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); + +#ifdef HAVE_BLK_MQ +module_param(zvol_blk_mq_queue_depth, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); + +module_param(zvol_use_blk_mq, uint, 0644); +MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); + +module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, + "Process volblocksize blocks per thread"); +#endif + /* END CSTYLED */ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a7ddb146e59b..2b846a6e66fb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -941,6 +941,10 @@ tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] +[tests/functional/zvol/zvol_stress] +tests = ['zvol_stress'] +tags = ['functional', 'zvol', 'zvol_stress'] + [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index b229a161518b..ddf9d349c5ff 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3334,18 +3334,22 @@ function is_te_enabled fi } -# Utility function to determine if a system has multiple cpus. -function is_mp +# Return the number of CPUs (cross-platform) +function get_num_cpus { - if is_linux; then - (($(nproc) > 1)) + if is_linux ; then + nproc elif is_freebsd; then sysctl -n kern.smp.cpus else - (($(psrinfo | wc -l) > 1)) + psrinfo | wc -l fi +} - return $? +# Utility function to determine if a system has multiple cpus. +function is_mp +{ + [[ $(get_num_cpus) -gt 1 ]] } function get_cpu_freq @@ -3888,14 +3892,23 @@ function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" + typeset check_only="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) - log_unsupported "Tunable '$name' is unsupported on $(uname)" + if [ -z "$check_only" ] ; then + log_unsupported "Tunable '$name' is unsupported on $(uname)" + else + return 1 + fi ;; "") - log_fail "Tunable '$name' must be added to tunables.cfg" + if [ -z "$check_only" ] ; then + log_fail "Tunable '$name' must be added to tunables.cfg" + else + return 1 + fi ;; *) ;; @@ -3919,6 +3932,14 @@ function get_tunable_impl return 1 } +# Does a tunable exist? +# +# $1: Tunable name +function tunable_exists +{ + get_tunable_impl $1 "zfs" 1 +} + # # Prints the current time in seconds since UNIX Epoch. 
# diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index eea2af2edcf0..fcacf519ce44 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -87,6 +87,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/functional/zvol/Makefile.am b/tests/zfs-tests/tests/functional/zvol/Makefile.am index e4910754bb81..9089a939abb0 100644 --- a/tests/zfs-tests/tests/functional/zvol/Makefile.am +++ b/tests/zfs-tests/tests/functional/zvol/Makefile.am @@ -5,6 +5,7 @@ dist_pkgdata_DATA = \ SUBDIRS = \ zvol_ENOSPC \ + zvol_stress \ zvol_cli \ zvol_misc \ zvol_swap diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am new file mode 100644 index 000000000000..5ccd0c7b5619 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/zvol/zvol_stress +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + zvol_stress.ksh diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh new file mode 100755 index 000000000000..b81a372638e3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh new file mode 100755 index 000000000000..746ac307a755 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh @@ -0,0 +1,38 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_setup "$DISKS" + +log_pass diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh new file mode 100755 index 000000000000..94d3717c42af --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -0,0 +1,171 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/reservation/reservation.shlib + +# +# DESCRIPTION: +# Stress test multithreaded transfers to multiple zvols. Also verify +# zvol errors show up in zpool status. +# +# STRATEGY: +# +# For both the normal submit_bio() codepath and the blk-mq codepath, do +# the following: +# +# 1. Create one zvol per CPU +# 2. In parallel, spawn an fio "write and verify" for each zvol +# 3. Inject write errors +# 4. Write to one of the zvols with dd and verify the errors +# + +verify_runnable "global" + +num_zvols=$(get_num_cpus) + +# If we were making one big zvol from all the pool space, it would +# be this big: +biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL) + +# Crude calculation: take the biggest zvol size we could possibly +# create, knock 10% off it (for overhead) and divide by the number +# of ZVOLs we want to make. 
+each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9) / $num_zvols )) + +typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)" + +function create_zvols +{ + log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each" + for i in $(seq $num_zvols) ; do + log_must zfs create -V $each_zvol_size $TESTPOOL/testvol$i + block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/testvol$i" + done +} + +function destroy_zvols +{ + for i in $(seq $num_zvols) ; do + log_must_busy zfs destroy $TESTPOOL/testvol$i + done +} + +# enable/disable blk-mq (if available) +# +# $1: 1 = enable, 0 = disable +function set_blk_mq +{ + # Not all kernels support blk-mq + if tunable_exists VOL_USE_BLK_MQ ; then + log_must set_tunable32 VOL_USE_BLK_MQ $1 + fi +} + +function do_zvol_stress +{ + # Write 10% of each zvol, or 50MB, whichever is less + zvol_write_size=$((each_zvol_size / 10)) + if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then + zvol_write_size=$((50 * 1048576)) + fi + zvol_write_size_mb=$(($zvol_write_size / 1048576)) + + if is_linux ; then + engine=libaio + else + engine=psync + fi + + # Spawn off one fio per zvol in parallel + pids="" + for i in $(seq $num_zvols) ; do + # Spawn one fio per zvol as its own process + fio --ioengine=$engine --name=zvol_stress$i --direct=0 \ + --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" --bs=1048576 \ + --iodepth=10 --readwrite=randwrite --size=${zvol_write_size} \ + --verify_async=2 --numjobs=1 --verify=sha1 \ + --verify_fatal=1 \ + --continue_on_error=none \ + --error_dump=1 \ + --exitall_on_error \ + --aux-path="$tmpdir" --do_verify=1 & + pids="$pids $!" + done + + # Wait for all the spawned fios to finish and look for errors + fail="" + i=0 + for pid in $pids ; do + log_note "$s waiting on $pid" + if ! wait $pid ; then + log_fail "fio error on $TESTPOOL/testvol$i" + fi + i=$(($i + 1)) + done +} + +function cleanup +{ + log_must zinject -c all + log_must zpool clear $TESTPOOL + destroy_zvols + set_blk_mq 0 + + # Remove all fio's leftover state files + if [ -n "$tmpdir" ] ; then + rm -f "$tmpdir"/*.state + rmdir "$tmpdir" + fi +} + +log_onexit cleanup + +log_assert "Stress test zvols" + +set_blk_mq 0 +create_zvols +# Do some fio write/verifies in parallel +do_zvol_stress +destroy_zvols + +# Enable blk-mq (block multi-queue), and re-run the same test +set_blk_mq 1 +create_zvols +do_zvol_stress + +# Inject some errors, and verify we see some IO errors in zpool status +for DISK in $DISKS ; do + log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL +done +log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50 +log_must zinject -c all + +log_must zpool status +write_errors=$(zpool status -pv | grep $DISK | awk '{print $4}') +if [ $write_errors -le 0 ] ; then + log_fail "Expected to see some write errors (saw $write_errors)" +else + log_note "Correctly saw $write_errors write errors" +fi +log_pass "Done with zvol_stress"
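A verification sketch (not part of the test; the device name zd0 is hypothetical, zvols appear as /dev/zd<N>): whether a given zvol actually ended up on the blk-mq path can be checked from sysfs, since request-based (blk-mq) block devices expose an mq/ directory of hardware queues while the bio-based submit_bio() path does not.

    # Hypothetical zvol device node; substitute the real zd<N> device.
    dev=zd0
    if [ -d /sys/block/$dev/mq ] ; then
        echo "$dev is using blk-mq"
    else
        echo "$dev is using the bio-based (submit_bio) path"
    fi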