zvol: Support blk-mq for better performance
Add support for the kernel's block multiqueue (blk-mq) interface in
the zvol block driver.  blk-mq creates multiple request queues on
different CPUs rather than having a single request queue.  This can
improve zvol performance with multithreaded reads/writes.
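
As a rough illustration of the driver-side pattern blk-mq enables (a
hedged sketch against the stock 4.13+ kernel API, not code added by
this commit; zvol_mq_queue_rq() and zvol_mq_setup() are invented
names):

	#include <linux/blk-mq.h>

	static blk_status_t
	zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
	    const struct blk_mq_queue_data *bd)
	{
		/* hand bd->rq off to a zvol worker; it is completed later */
		blk_mq_start_request(bd->rq);
		return (BLK_STS_OK);
	}

	static const struct blk_mq_ops zvol_mq_ops = {
		.queue_rq = zvol_mq_queue_rq,
	};

	static struct blk_mq_tag_set tag_set;

	static int
	zvol_mq_setup(void)
	{
		tag_set.ops = &zvol_mq_ops;
		tag_set.nr_hw_queues = num_online_cpus(); /* one queue per CPU */
		tag_set.queue_depth = 128;
		tag_set.numa_node = NUMA_NO_NODE;
		tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
		return (blk_mq_alloc_tag_set(&tag_set));
	}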

This implementation uses the blk-mq interfaces on 4.13 or newer
kernels.  Building against older kernels will fall back to the
older BIO interfaces.

Note that you must set the `zvol_use_blk_mq` module param to
enable the blk-mq API.  It is disabled by default.

In addition, this commit lets the zvol blk-mq layer process whole
`struct request` IOs at a time, rather than breaking them down
into their individual BIOs.  This reduces dbuf lock contention
and overhead versus the legacy zvol submit_bio() codepath.

	sequential dd to one zvol, 8k volblocksize, no O_DIRECT:

	legacy submit_bio()     292MB/s write  453MB/s read
	this commit             453MB/s write  885MB/s read

It also introduces a new `zvol_blk_mq_blocks_per_thread` module
parameter.  This parameter sets how many volblocksize'd blocks each
zvol thread processes at a time.  It can be used to tune zvols for
better read vs write performance (higher values favor writes, lower
values favor reads).  For example, with an 8k volblocksize and the
default value of 8, each zvol thread processes 64k of a request at
a time.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes openzfs#13148
Issue openzfs#12483
tonyhutter authored and andrewc12 committed Sep 23, 2022
1 parent 399467f commit e95e4b3
Showing 18 changed files with 1,437 additions and 148 deletions.
32 changes: 32 additions & 0 deletions config/kernel-blk-queue.m4
@@ -359,6 +359,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
])
])

dnl #
dnl # See if kernel supports block multi-queue and blk_status_t.
dnl # blk_status_t represents the new status codes introduced in the 4.13
dnl # kernel patch:
dnl #
dnl # block: introduce new block status code type
dnl #
dnl # We do not currently support the "old" block multi-queue interfaces from
dnl # prior kernels.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
ZFS_LINUX_TEST_SRC([blk_mq], [
#include <linux/blk-mq.h>
], [
struct blk_mq_tag_set tag_set __attribute__ ((unused)) = {0};
(void) blk_mq_alloc_tag_set(&tag_set);
return BLK_STS_OK;
], [])
])

AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
ZFS_LINUX_TEST_RESULT([blk_mq], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
], [
AC_MSG_RESULT(no)
])
])

AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
@@ -370,6 +400,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
ZFS_AC_KERNEL_SRC_BLK_MQ
])

AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
@@ -383,4 +414,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
ZFS_AC_KERNEL_BLK_MQ
])
111 changes: 111 additions & 0 deletions include/os/linux/kernel/linux/blkdev_compat.h
@@ -34,6 +34,11 @@
#include <linux/hdreg.h>
#include <linux/major.h>
#include <linux/msdos_fs.h> /* for SECTOR_* */
#include <linux/bio.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
#endif

#ifndef HAVE_BLK_QUEUE_FLAG_SET
static inline void
@@ -608,4 +613,110 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
}
#endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

/*
* All the io_*() helper functions below can operate on a bio, or a rq, but
* not both. The older submit_bio() codepath will pass a bio, and the
* newer blk-mq codepath will pass a rq.
*/
static inline int
io_data_dir(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL) {
if (op_is_write(req_op(rq))) {
return (WRITE);
} else {
return (READ);
}
}
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_data_dir(bio));
}

static inline int
io_is_flush(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (req_op(rq) == REQ_OP_FLUSH);
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_is_flush(bio));
}

static inline int
io_is_discard(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (req_op(rq) == REQ_OP_DISCARD);
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_is_discard(bio));
}

static inline int
io_is_secure_erase(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (req_op(rq) == REQ_OP_SECURE_ERASE);
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_is_secure_erase(bio));
}

static inline int
io_is_fua(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (rq->cmd_flags & REQ_FUA);
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_is_fua(bio));
}

static inline uint64_t
io_offset(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (blk_rq_pos(rq) << 9);
#else
ASSERT3P(rq, ==, NULL);
#endif
return (BIO_BI_SECTOR(bio) << 9);
}

static inline uint64_t
io_size(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (blk_rq_bytes(rq));
#else
ASSERT3P(rq, ==, NULL);
#endif
return (BIO_BI_SIZE(bio));
}

static inline int
io_has_data(struct bio *bio, struct request *rq)
{
#ifdef HAVE_BLK_MQ
if (rq != NULL)
return (bio_has_data(rq->bio));
#else
ASSERT3P(rq, ==, NULL);
#endif
return (bio_has_data(bio));
}
#endif /* _ZFS_BLKDEV_H */
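
To show how these dual-mode helpers are meant to be consumed, here is a
minimal, hypothetical dispatch routine (zvol_dispatch_io() is an invented
name; the commit's real consumer is in zvol_os.c, which is not shown above):

static void
zvol_dispatch_io(struct bio *bio, struct request *rq)
{
	/* Exactly one of bio/rq is non-NULL, as described above. */
	uint64_t off = io_offset(bio, rq);	/* starting byte offset */
	uint64_t len = io_size(bio, rq);	/* total bytes in this I/O */

	if (io_is_flush(bio, rq)) {
		/* No data payload; just force the log out to disk. */
		return;
	}

	if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
		/* Free the range [off, off + len). */
		return;
	}

	if (io_data_dir(bio, rq) == WRITE) {
		/* Write path; io_is_fua(bio, rq) decides if it must sync. */
	} else {
		/* Read path. */
	}
}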
39 changes: 33 additions & 6 deletions include/os/linux/spl/sys/uio.h
@@ -69,9 +69,20 @@ typedef struct zfs_uio {
uint16_t uio_fmode;
uint16_t uio_extflg;
ssize_t uio_resid;

size_t uio_skip;

struct request *rq;

/*
* Used for saving rq_for_each_segment() state between calls
* to zfs_uiomove_bvec_rq().
*/
struct req_iterator iter;
struct bio_vec bv;
} zfs_uio_t;

#define zfs_uio_segflg(u) (u)->uio_segflg
#define zfs_uio_offset(u) (u)->uio_loffset
#define zfs_uio_resid(u) (u)->uio_resid
@@ -116,17 +127,33 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
}

static inline void
zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio)
zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
{
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
/* Either bio or rq will be set, but not both */
ASSERT3P(uio, !=, bio);

if (bio) {
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
} else {
uio->uio_bvec = NULL;
uio->uio_iovcnt = 0;
memset(&uio->iter, 0, sizeof (uio->iter));
}

uio->uio_loffset = io_offset(bio, rq);
uio->uio_segflg = UIO_BVEC;
uio->uio_fault_disable = B_FALSE;
uio->uio_fmode = 0;
uio->uio_extflg = 0;
uio->uio_resid = BIO_BI_SIZE(bio);
uio->uio_skip = BIO_BI_SKIP(bio);
uio->uio_resid = io_size(bio, rq);
if (bio) {
uio->uio_skip = BIO_BI_SKIP(bio);
} else {
uio->uio_skip = 0;
}

uio->rq = rq;
}

#if defined(HAVE_VFS_IOV_ITER)
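
For context on the new rq/iter/bv fields above, here is a simplified,
assumed sketch of the kind of segment walk they support (the commit's real
zfs_uiomove_bvec_rq() lives in zfs_uio.c and is not shown on this page;
uiomove_rq_read_sketch() is an invented name):

static void
uiomove_rq_read_sketch(const char *buf, size_t n, zfs_uio_t *uio)
{
	struct req_iterator iter;
	struct bio_vec bv;
	uint64_t seg_start = io_offset(NULL, uio->rq);

	/* Walk every segment of the request, skipping bytes already done. */
	rq_for_each_segment(bv, uio->rq, iter) {
		uint64_t seg_end = seg_start + bv.bv_len;

		if (n > 0 && seg_end > uio->uio_loffset) {
			size_t skip = uio->uio_loffset - seg_start;
			size_t cnt = min_t(size_t, bv.bv_len - skip, n);
			void *va = kmap(bv.bv_page);

			/* Read path: copy data out to the request's pages. */
			memcpy((char *)va + bv.bv_offset + skip, buf, cnt);
			kunmap(bv.bv_page);

			buf += cnt;
			n -= cnt;
			uio->uio_loffset += cnt;
			uio->uio_resid -= cnt;
		}
		seg_start = seg_end;
	}
	/*
	 * Per the struct comment above, the real code also saves `iter` and
	 * `bv` into the uio so a later call on the same request can resume
	 * where it left off instead of rescanning from the beginning.
	 */
}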
69 changes: 67 additions & 2 deletions man/man4/zfs.4
@@ -2248,9 +2248,74 @@ for each I/O submitter.
When unset, requests are handled asynchronously by a thread pool.
The number of requests which can be handled concurrently is controlled by
.Sy zvol_threads .
.Sy zvol_request_sync
is ignored when running on a kernel that supports block multiqueue
.Pq Li blk-mq .
.
.It Sy zvol_threads Ns = Ns Sy 32 Pq uint
Max number of threads which can handle zvol I/O requests concurrently.
.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
The number of system wide threads to use for processing zvol block IOs.
If
.Sy 0
(the default) then internally set
.Sy zvol_threads
to the number of CPUs present or 32 (whichever is greater).
.
.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
The number of threads per zvol to use for queuing IO requests.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only read and assigned to a zvol at zvol load time.
If
.Sy 0
(the default) then internally set
.Sy zvol_blk_mq_threads
to the number of CPUs present.
.
.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Set to
.Sy 1
to use the
.Li blk-mq
API for zvols.
Set to
.Sy 0
(the default) to use the legacy zvol APIs.
This setting can give better or worse zvol performance depending on
the workload.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only read and assigned to a zvol at zvol load time.
.
.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
If
.Sy zvol_use_blk_mq
is enabled, then process this number of
.Sy volblocksize Ns -sized blocks per zvol thread.
This tunable can be used to favor better performance for zvol reads (lower
values) or writes (higher values).
If set to
.Sy 0 ,
then the zvol layer will process the maximum number of blocks
per thread that it can.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only applied at each zvol's load time.
.
.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
The queue_depth value for the zvol
.Li blk-mq
interface.
This parameter will only appear if your kernel supports
.Li blk-mq
and is only applied at each zvol's load time.
If
.Sy 0
(the default) then use the kernel's default queue depth.
Values are clamped to the kernel's
.Dv BLKDEV_MIN_RQ
and
.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
limits.
.
.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
Defines zvol block devices behaviour when