Skip to content

Commit

Permalink
Improve ZFS N-way mirror read performance by using load and locality
Browse files Browse the repository at this point in the history
information.

The existing algorithm selects a preferred leaf vdev based on offset of the zio
request modulo the number of members in the mirror. It assumes the devices are
of equal performance and that spreading the requests randomly over both drives
will be sufficient to saturate them. In practice this results in the leaf vdevs
being underutilized.

The new algorithm takes into account the following additional factors:
* Load of the vdevs (number outstanding I/O requests)
* The locality of last queued I/O vs the new I/O request.

Within the locality calculation, additional knowledge about the underlying vdev
is considered, such as whether the device backing the vdev is a rotating media
device.

This results in performance increases across the board as well as significant
increases for predominantly streaming loads and for configurations which don't
have evenly performing devices.

The following are results from a setup with a 3-way mirror of 2 x HDs and
1 x SSD, from a basic test running multiple parallel dd's.

With pre-fetch disabled (vfs.zfs.prefetch_disable=1):

== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s

With pre-fetch enabled (vfs.zfs.prefetch_disable=0):

== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s

In addition to the performance changes, the code was also restructured, with
the help of Justin Gibbs, to provide a more logical flow which also ensures
vdev loads are only calculated from the set of valid candidates.

The following additional sysctls were added to allow the administrator
to tune the behaviour of the load algorithm:
* vfs.zfs.vdev.mirror.rotating_inc
* vfs.zfs.vdev.mirror.rotating_seek_inc
* vfs.zfs.vdev.mirror.rotating_seek_offset
* vfs.zfs.vdev.mirror.non_rotating_inc
* vfs.zfs.vdev.mirror.non_rotating_seek_inc

These changes were based on work started by the zfsonlinux developers:
openzfs/zfs#1487

Reviewed by:	gibbs, mav, will
MFC after:	2 weeks
Sponsored by:	Multiplay

(cherry picked from commit 5c7a6f5)
  • Loading branch information
smh authored and delphij committed Feb 5, 2014
1 parent 9dfc052 commit f0962d7
Show file tree
Hide file tree
Showing 13 changed files with 343 additions and 73 deletions.
3 changes: 2 additions & 1 deletion sys/cam/ata/ata_da.c
Original file line number Diff line number Diff line change
Expand Up @@ -1227,12 +1227,13 @@ adaregister(struct cam_periph *periph, void *arg)
"kern.cam.ada.%d.write_cache", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
/* Disable queue sorting for non-rotational media by default. */
if (cgd->ident_data.media_rotation_rate == 1)
if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING)
softc->sort_io_queue = 0;
else
softc->sort_io_queue = -1;
adagetparams(periph, cgd);
softc->disk = disk_alloc();
softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, softc->params.secsize,
DEVSTAT_ALL_SUPPORTED,
Expand Down
2 changes: 1 addition & 1 deletion sys/cam/scsi/scsi_all.h
Original file line number Diff line number Diff line change
Expand Up @@ -1451,7 +1451,7 @@ struct scsi_vpd_block_characteristics
u_int8_t page_length[2];
u_int8_t medium_rotation_rate[2];
#define SVPD_BDC_RATE_NOT_REPORTED 0x00
#define SVPD_BDC_RATE_NONE_ROTATING 0x01
#define SVPD_BDC_RATE_NON_ROTATING 0x01
u_int8_t reserved1;
u_int8_t nominal_form_factor;
#define SVPD_BDC_FORM_NOT_REPORTED 0x00
Expand Down
27 changes: 24 additions & 3 deletions sys/cam/scsi/scsi_da.c
Original file line number Diff line number Diff line change
Expand Up @@ -3370,9 +3370,18 @@ dadone(struct cam_periph *periph, union ccb *done_ccb)
* Disable queue sorting for non-rotational media
* by default.
*/
if (scsi_2btoul(bdc->medium_rotation_rate) ==
SVPD_BDC_RATE_NONE_ROTATING)
u_int old_rate = softc->disk->d_rotation_rate;

softc->disk->d_rotation_rate =
scsi_2btoul(bdc->medium_rotation_rate);
if (softc->disk->d_rotation_rate ==
SVPD_BDC_RATE_NON_ROTATING) {
softc->sort_io_queue = 0;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
Expand Down Expand Up @@ -3407,6 +3416,8 @@ dadone(struct cam_periph *periph, union ccb *done_ccb)
ptr = (uint16_t *)ata_params;

if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint16_t old_rate;

for (i = 0; i < sizeof(*ata_params) / 2; i++)
ptr[i] = le16toh(ptr[i]);
if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM) {
Expand All @@ -3421,8 +3432,18 @@ dadone(struct cam_periph *periph, union ccb *done_ccb)
* Disable queue sorting for non-rotational media
* by default.
*/
if (ata_params->media_rotation_rate == 1)
old_rate = softc->disk->d_rotation_rate;
softc->disk->d_rotation_rate =
ata_params->media_rotation_rate;
if (softc->disk->d_rotation_rate ==
ATA_RATE_NON_ROTATING) {
softc->sort_io_queue = 0;
}

if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
Expand Down
3 changes: 3 additions & 0 deletions sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);

extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ struct vdev_queue {
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
uint64_t vq_lastoffset;
};

/*
Expand Down Expand Up @@ -227,7 +228,10 @@ struct vdev {
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
struct trim_map *vdev_trimmap;
struct trim_map *vdev_trimmap; /* map on outstanding trims */
uint16_t vdev_rotation_rate; /* rotational rate of the media */
#define VDEV_RATE_UNKNOWN 0
#define VDEV_RATE_NON_ROTATING 1

/*
* For DTrace to work in userland (libzpool) context, these fields must
Expand Down
35 changes: 35 additions & 0 deletions sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@
* Virtual device vector for GEOM.
*/

/*
 * Forward declaration of the attribute-change handler; its definition
 * appears later in this file, after the class descriptor that refers to it.
 */
static g_attrchanged_t vdev_geom_attrchanged;
/*
 * GEOM class descriptor for ZFS vdevs.  It is handed to
 * DECLARE_GEOM_CLASS() immediately below.
 */
struct g_class zfs_vdev_class = {
.name = "ZFS::VDEV",
.version = G_VERSION,
/* Handler that re-reads "GEOM::rotation_rate" when that attribute changes. */
.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
Expand Down Expand Up @@ -72,6 +74,34 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, larger_ashift_minimal, CTLFLAG_RW,

#define ZFSVOL_CLASS_NAME "ZFS::ZVOL"

/*
 * Record the media rotation rate for a vdev.
 *
 * The "GEOM::rotation_rate" attribute is requested through consumer cp and
 * stored in vd->vdev_rotation_rate.  If the request fails, the rate is
 * recorded as VDEV_RATE_UNKNOWN instead.
 */
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	uint16_t rotation;

	/* Fall back to "unknown" whenever the attribute cannot be read. */
	if (g_getattr("GEOM::rotation_rate", cp, &rotation) != 0)
		rotation = VDEV_RATE_UNKNOWN;

	vd->vdev_rotation_rate = rotation;
}

/*
 * Attribute-change handler installed in zfs_vdev_class.
 *
 * The vdev is taken from cp->private; a consumer with no vdev attached is
 * ignored.  Only the "GEOM::rotation_rate" attribute is acted upon, in which
 * case the vdev's stored rotation rate is refreshed.  Every other attribute
 * name is ignored.
 */
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vdev = cp->private;

	if (vdev != NULL && strcmp(attr, "GEOM::rotation_rate") == 0)
		vdev_geom_set_rotation_rate(vdev, cp);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
Expand Down Expand Up @@ -702,6 +732,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);

/*
* Determine the device's rotation rate.
*/
vdev_geom_set_rotation_rate(vd, cp);

return (0);
}

Expand Down
Loading

0 comments on commit f0962d7

Please sign in to comment.