Skip to content

Commit

Permalink
zvol: Support blk-mq for better performance
Browse files Browse the repository at this point in the history
Add support for the kernel's block multiqueue (blk-mq) interface in
the zvol block driver.  blk-mq creates multiple request queues on
different CPUs rather than having a single request queue.  This can
improve zvol performance with multithreaded reads/writes.

This implementation uses the blk-mq interfaces on 4.13 or newer
kernels.  Building against older kernels will fall back to the
older BIO interfaces.

Note that you must set the `zvol_use_blk_mq` module param to
enable the blk-mq API.  It is disabled by default.

In addition, this commit lets the zvol blk-mq layer process whole
`struct request` IOs at a time, rather than breaking them down
into their individual BIOs.  This reduces dbuf lock contention
and overhead versus the legacy zvol submit_bio() codepath.

	sequential dd to one zvol, 8k volblocksize, no O_DIRECT:

	legacy submit_bio()     292MB/s write  453MB/s read
	this commit             453MB/s write  885MB/s read

It also introduces a new `zvol_blk_mq_blocks_per_thread` module
parameter. This parameter sets how many volblocksize-sized blocks
each zvol thread processes at a time.  It can be used to tune your
zvols for better read vs write performance (higher values favor write,
lower values favor read).

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Issue #12483
  • Loading branch information
tonyhutter committed Mar 8, 2022
1 parent a5b3fab commit a7e5a62
Show file tree
Hide file tree
Showing 21 changed files with 1,368 additions and 144 deletions.
32 changes: 32 additions & 0 deletions config/kernel-blk-queue.m4
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
])
])

dnl #
dnl # See if kernel supports block multi-queue and blk_status_t.
dnl # blk_status_t represents the new status codes introduced in the 4.13
dnl # kernel patch:
dnl #
dnl # block: introduce new block status code type
dnl #
dnl # We do not currently support the "old" block multi-queue interfaces from
dnl # prior kernels.
dnl #
dnl # The test program below includes linux/blk-mq.h and references
dnl # struct blk_mq_tag_set together with blk_mq_alloc_tag_set and BLK_STS_OK
dnl # so it only builds when the kernel headers provide all of them.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
ZFS_LINUX_TEST_SRC([blk_mq], [
#include <linux/blk-mq.h>
], [
struct blk_mq_tag_set tag_set = {0};
(void) blk_mq_alloc_tag_set(&tag_set);
return BLK_STS_OK;
], [$NO_UNUSED_BUT_SET_VARIABLE])
])

dnl #
dnl # Report the outcome of the blk_mq test and define HAVE_BLK_MQ
dnl # when the test succeeded.  Nothing is defined on failure.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
ZFS_LINUX_TEST_RESULT([blk_mq], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
], [
AC_MSG_RESULT(no)
])
])

AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
Expand All @@ -326,6 +356,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
ZFS_AC_KERNEL_SRC_BLK_MQ
])

AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
Expand All @@ -339,4 +370,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
ZFS_AC_KERNEL_BLK_MQ
])
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/zpool_influxdb/Makefile
tests/zfs-tests/tests/functional/zvol/Makefile
tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile
tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile
tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile
tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile
tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile
Expand Down
91 changes: 91 additions & 0 deletions include/os/linux/kernel/linux/blkdev_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
#include <linux/hdreg.h>
#include <linux/major.h>
#include <linux/msdos_fs.h> /* for SECTOR_* */
#include <linux/bio.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
#endif

#ifndef HAVE_BLK_QUEUE_FLAG_SET
static inline void
Expand Down Expand Up @@ -579,4 +584,90 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
}
#endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

/*
 * Return the data direction of an I/O: WRITE or READ.
 *
 * When built with HAVE_BLK_MQ and 'rq' is non-NULL, the direction is taken
 * from the request's operation via op_is_write().  In every other case it
 * is whatever bio_data_dir() reports for 'bio'.
 *
 * The request op is handed directly to op_is_write() rather than being
 * stored in a local "enum req_opf" variable: that enum tag is not present
 * under the same name on all kernels (NOTE(review): it was renamed to
 * "enum req_op" in Linux 6.0 - confirm), and naming it here would break
 * the build there without any functional benefit.
 */
static inline int
io_data_dir(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq != NULL) {
		if (op_is_write(req_op(rq))) {
			return (WRITE);
		} else {
			return (READ);
		}
	}
#endif
	return (bio_data_dir(bio));
}

/*
 * Tell whether an I/O is a flush.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', the answer is whether the request
 * op equals REQ_OP_FLUSH; otherwise it is the result of bio_is_flush(bio).
 */
static inline int
io_is_flush(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (bio_is_flush(bio));

	return (req_op(rq) == REQ_OP_FLUSH);
#else
	return (bio_is_flush(bio));
#endif
}

/*
 * Tell whether an I/O is a discard.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', the answer is whether the request
 * op equals REQ_OP_DISCARD; otherwise it is the result of
 * bio_is_discard(bio).
 */
static inline int
io_is_discard(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (bio_is_discard(bio));

	return (req_op(rq) == REQ_OP_DISCARD);
#else
	return (bio_is_discard(bio));
#endif
}

/*
 * Tell whether an I/O is a secure erase.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', the answer is whether the request
 * op equals REQ_OP_SECURE_ERASE; otherwise it is the result of
 * bio_is_secure_erase(bio).
 */
static inline int
io_is_secure_erase(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (bio_is_secure_erase(bio));

	return (req_op(rq) == REQ_OP_SECURE_ERASE);
#else
	return (bio_is_secure_erase(bio));
#endif
}

/*
 * Tell whether an I/O carries the FUA flag.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', this returns the REQ_FUA bit
 * masked out of rq->cmd_flags (nonzero when set, not normalized to 1);
 * otherwise it returns the result of bio_is_fua(bio).
 */
static inline int
io_is_fua(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (bio_is_fua(bio));

	return (rq->cmd_flags & REQ_FUA);
#else
	return (bio_is_fua(bio));
#endif
}


/*
 * Return the starting byte offset of an I/O.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', the starting sector comes from
 * blk_rq_pos(rq); otherwise it comes from BIO_BI_SECTOR(bio).  The sector
 * number is shifted left by 9, i.e. multiplied by 512, to get bytes.
 *
 * The sector number is widened to uint64_t before the shift so the byte
 * offset cannot be truncated when the kernel's sector type is narrower
 * than 64 bits (NOTE(review): presumably only 32-bit kernels built without
 * large block device support - confirm).  Where the sector type is already
 * 64 bits wide the casts change nothing.
 */
static inline uint64_t
io_offset(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq != NULL)
		return ((uint64_t)blk_rq_pos(rq) << 9);
#endif
	return ((uint64_t)BIO_BI_SECTOR(bio) << 9);
}

/*
 * Return the size of an I/O.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', the size is blk_rq_bytes(rq);
 * otherwise it is BIO_BI_SIZE(bio).
 */
static inline uint64_t
io_size(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (BIO_BI_SIZE(bio));

	return (blk_rq_bytes(rq));
#else
	return (BIO_BI_SIZE(bio));
#endif
}

/*
 * Tell whether an I/O has data attached.
 *
 * With HAVE_BLK_MQ and a non-NULL 'rq', bio_has_data() is evaluated on the
 * request's own bio (rq->bio) and the 'bio' argument is not consulted;
 * otherwise bio_has_data() is evaluated on 'bio'.
 */
static inline int
io_has_data(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
	if (rq == NULL)
		return (bio_has_data(bio));

	return (bio_has_data(rq->bio));
#else
	return (bio_has_data(bio));
#endif
}
#endif /* _ZFS_BLKDEV_H */
37 changes: 30 additions & 7 deletions include/os/linux/spl/sys/uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,20 @@ typedef struct zfs_uio {
boolean_t uio_fault_disable;
uint16_t uio_fmode;
uint16_t uio_extflg;
ssize_t uio_resid;
ssize_t uio_resid;
size_t uio_skip;

struct request *rq;

/*
* Used for saving rq_for_each_segment() state between calls
* to zfs_uiomove_bvec_rq().
*/
struct req_iterator iter;
struct bio_vec bv;
} zfs_uio_t;


#define zfs_uio_segflg(u) (u)->uio_segflg
#define zfs_uio_offset(u) (u)->uio_loffset
#define zfs_uio_resid(u) (u)->uio_resid
Expand Down Expand Up @@ -116,17 +126,30 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
}

static inline void
zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio)
zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
{
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
if (bio) {
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
} else {
uio->uio_bvec = NULL;
uio->uio_iovcnt = 0;
memset(&uio->iter, 0, sizeof (uio->iter));
}

uio->uio_loffset = io_offset(bio, rq);
uio->uio_segflg = UIO_BVEC;
uio->uio_fault_disable = B_FALSE;
uio->uio_fmode = 0;
uio->uio_extflg = 0;
uio->uio_resid = BIO_BI_SIZE(bio);
uio->uio_skip = BIO_BI_SKIP(bio);
uio->uio_resid = io_size(bio, rq);
if (bio) {
uio->uio_skip = BIO_BI_SKIP(bio);
} else {
uio->uio_skip = 0;
}

uio->rq = rq;
}

#if defined(HAVE_VFS_IOV_ITER)
Expand Down
63 changes: 61 additions & 2 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -2207,9 +2207,68 @@ for each I/O submitter.
When unset, requests are handled asynchronously by a thread pool.
The number of requests which can be handled concurrently is controlled by
.Sy zvol_threads .
.Sy zvol_request_sync
is ignored when running on a kernel that supports block multiqueue
.Pq Li blk-mq .
.
.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
The number of threads to use for processing zvol block IOs.
On older
.No non- Ns Li blk-mq
kernels,
.Sy zvol_threads
is the total number of threads to use for all zvols.
On kernels that support
.Li blk-mq ,
.Sy zvol_threads
is also the number of queues per zvol.
If
.Sy 0
(the default) then internally set
.Sy zvol_threads
to the number of CPUs present.
.
.It Sy zvol_threads Ns = Ns Sy 32 Pq uint
Max number of threads which can handle zvol I/O requests concurrently.
.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Set to
.Sy 1
to use the
.Li blk-mq
API for zvols.
Set to
.Sy 0
(the default) to use the legacy zvol APIs.
This setting can give better or worse zvol performance depending on
the workload.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only read and assigned to a zvol at zvol load time.
.
.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
If
.Sy zvol_use_blk_mq
is enabled, then process this number of volblocksize blocks per zvol thread.
This tunable can be used to favor better performance for zvol reads (lower
values) or writes (higher values).
If set to 0, then the zvol layer will process the maximum number of blocks
per thread that it can.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only read and assigned to a zvol at zvol load time.
.
.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
The queue_depth value for the zvol
.Li blk-mq
interface.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only read at zvol load time.
If
.Sy 0
(the default) then use the kernel's default queue depth.
If you set
.Sy zvol_blk_mq_queue_depth
lower than the kernel's minimum queue depth, it will be internally
capped to the kernel's minimum queue depth.
.
.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
Defines zvol block devices behaviour when
Expand Down
Loading

0 comments on commit a7e5a62

Please sign in to comment.