From 25dcbd5cbfacfd7a6452ea09a9f33e7e2e18a235 Mon Sep 17 00:00:00 2001 From: smh Date: Wed, 23 Oct 2013 09:54:58 +0000 Subject: [PATCH] Improve ZFS N-way mirror read performance by using load and locality information. The existing algorithm selects a preferred leaf vdev based on offset of the zio request modulo the number of members in the mirror. It assumes the devices are of equal performance and that spreading the requests randomly over both drives will be sufficient to saturate them. In practice this results in the leaf vdevs being under utilized. The new algorithm takes into account the following additional factors: * Load of the vdevs (number outstanding I/O requests) * The locality of last queued I/O vs the new I/O request. Within the locality calculation additional knowledge about the underlying vdev is considered such as; is the device backing the vdev a rotating media device. This results in performance increases across the board as well as significant increases for predominantly streaming loads and for configurations which don't have evenly performing devices. The following are results from a setup with 3 Way Mirror with 2 x HD's and 1 x SSD from a basic test running multiple parallel dd's. 
With pre-fetch disabled (vfs.zfs.prefetch_disable=1): == Stripe Balanced (default) == Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s == Load Balanced (zfslinux) == Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s == Load Balanced (locality freebsd) == Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s With pre-fetch enabled (vfs.zfs.prefetch_disable=0): == Stripe Balanced (default) == Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s == Load Balanced (zfslinux) == Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s == Load Balanced (locality freebsd) == Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s In addition to the performance changes the code was also restructured, with the help of Justin Gibbs, to provide a more logical flow which also ensures vdev loads are only calculated from the set of valid candidates. The following additional sysctls were added to allow the administrator to tune the behaviour of the load algorithm: * vfs.zfs.vdev.mirror.rotating_inc * vfs.zfs.vdev.mirror.rotating_seek_inc * vfs.zfs.vdev.mirror.rotating_seek_offset * vfs.zfs.vdev.mirror.non_rotating_inc * vfs.zfs.vdev.mirror.non_rotating_seek_inc These changes were based on work started by the zfsonlinux developers: https://github.com/zfsonlinux/zfs/pull/1487 Reviewed by: gibbs, mav, will MFC after: 2 weeks Sponsored by: Multiplay (cherry picked from commit 5c7a6f5d929fa744b3a959b187d32c752a5ea7d5) --- sys/cam/ata/ata_da.c | 3 +- sys/cam/scsi/scsi_all.h | 2 +- sys/cam/scsi/scsi_da.c | 27 +- .../opensolaris/uts/common/fs/zfs/sys/vdev.h | 3 + .../uts/common/fs/zfs/sys/vdev_impl.h | 6 +- .../opensolaris/uts/common/fs/zfs/vdev_geom.c | 35 +++ .../uts/common/fs/zfs/vdev_mirror.c | 282 ++++++++++++++---- .../uts/common/fs/zfs/vdev_queue.c | 25 ++ sys/geom/geom.h | 1 + sys/geom/geom_disk.c | 19 +- sys/geom/geom_disk.h | 4 +- sys/geom/geom_subr.c | 7 + 
sys/sys/ata.h | 2 + 13 files changed, 343 insertions(+), 73 deletions(-) diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index 1e00c16268d60..3f5cd6fe7ebf6 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -1259,12 +1259,13 @@ adaregister(struct cam_periph *periph, void *arg) "kern.cam.ada.%d.write_cache", periph->unit_number); TUNABLE_INT_FETCH(announce_buf, &softc->write_cache); /* Disable queue sorting for non-rotational media by default. */ - if (cgd->ident_data.media_rotation_rate == 1) + if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) softc->sort_io_queue = 0; else softc->sort_io_queue = -1; adagetparams(periph, cgd); softc->disk = disk_alloc(); + softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate; softc->disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, softc->params.secsize, DEVSTAT_ALL_SUPPORTED, diff --git a/sys/cam/scsi/scsi_all.h b/sys/cam/scsi/scsi_all.h index cf84d396ae44a..8ba309423f3c4 100644 --- a/sys/cam/scsi/scsi_all.h +++ b/sys/cam/scsi/scsi_all.h @@ -1451,7 +1451,7 @@ struct scsi_vpd_block_characteristics u_int8_t page_length[2]; u_int8_t medium_rotation_rate[2]; #define SVPD_BDC_RATE_NOT_REPORTED 0x00 -#define SVPD_BDC_RATE_NONE_ROTATING 0x01 +#define SVPD_BDC_RATE_NON_ROTATING 0x01 u_int8_t reserved1; u_int8_t nominal_form_factor; #define SVPD_BDC_FORM_NOT_REPORTED 0x00 diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index a35ea375fe7be..0703dfd064d3d 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -3429,9 +3429,18 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) * Disable queue sorting for non-rotational media * by default. 
*/ - if (scsi_2btoul(bdc->medium_rotation_rate) == - SVPD_BDC_RATE_NONE_ROTATING) + u_int old_rate = softc->disk->d_rotation_rate; + + softc->disk->d_rotation_rate = + scsi_2btoul(bdc->medium_rotation_rate); + if (softc->disk->d_rotation_rate == + SVPD_BDC_RATE_NON_ROTATING) { softc->sort_io_queue = 0; + } + if (softc->disk->d_rotation_rate != old_rate) { + disk_attr_changed(softc->disk, + "GEOM::rotation_rate", M_NOWAIT); + } } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, @@ -3466,6 +3475,8 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) ptr = (uint16_t *)ata_params; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { + uint16_t old_rate; + for (i = 0; i < sizeof(*ata_params) / 2; i++) ptr[i] = le16toh(ptr[i]); if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM && @@ -3481,8 +3492,18 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) * Disable queue sorting for non-rotational media * by default. */ - if (ata_params->media_rotation_rate == 1) + old_rate = softc->disk->d_rotation_rate; + softc->disk->d_rotation_rate = + ata_params->media_rotation_rate; + if (softc->disk->d_rotation_rate == + ATA_RATE_NON_ROTATING) { softc->sort_io_queue = 0; + } + + if (softc->disk->d_rotation_rate != old_rate) { + disk_attr_changed(softc->disk, + "GEOM::rotation_rate", M_NOWAIT); + } } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 15bd764615a34..2cbcf9c2b2887 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -120,6 +120,9 @@ extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); +extern int vdev_queue_length(vdev_t *vd); +extern uint64_t vdev_queue_lastoffset(vdev_t *vd); +extern void 
vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 518ebc46e9fd8..768d00e7236fa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -116,6 +116,7 @@ struct vdev_queue { uint64_t vq_last_offset; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; + uint64_t vq_lastoffset; }; /* @@ -227,7 +228,10 @@ struct vdev { spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ - struct trim_map *vdev_trimmap; + struct trim_map *vdev_trimmap; /* map on outstanding trims */ + uint16_t vdev_rotation_rate; /* rotational rate of the media */ +#define VDEV_RATE_UNKNOWN 0 +#define VDEV_RATE_NON_ROTATING 1 /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index aa1f3a7dc8628..78bed98f5b184 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -42,9 +42,11 @@ * Virtual device vector for GEOM. 
*/ +static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, }; DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); @@ -72,6 +74,34 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, larger_ashift_minimal, CTLFLAG_RW, #define ZFSVOL_CLASS_NAME "ZFS::ZVOL" +static void +vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) +{ + int error; + uint16_t rate; + + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0) + vd->vdev_rotation_rate = rate; + else + vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; +} + +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + vdev_t *vd; + + vd = cp->private; + if (vd == NULL) + return; + + if (strcmp(attr, "GEOM::rotation_rate") == 0) { + vdev_geom_set_rotation_rate(vd, cp); + return; + } +} + static void vdev_geom_orphan(struct g_consumer *cp) { @@ -702,6 +732,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP); snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name); + /* + * Determine the device's rotation rate. 
+ */ + vdev_geom_set_rotation_rate(vd, cp); + return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 4ea8e6d12707b..1ffd1de2731f6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -41,27 +41,97 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; int mm_children; - int mm_replacing; - int mm_preferred; - int mm_root; - mirror_child_t mm_child[1]; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; } mirror_map_t; -int vdev_mirror_shift = 21; +static int vdev_mirror_shift = 21; + +SYSCTL_DECL(_vfs_zfs_vdev); +static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV Mirror"); + +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + +/* Rotating media load calculation configuration. 
*/ +static int rotating_inc = 0; +TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_inc", &rotating_inc); +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RW, + &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); + +static int rotating_seek_inc = 5; +TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_inc", &rotating_seek_inc); +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RW, + &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); + +static int rotating_seek_offset = 1 * 1024 * 1024; +TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_offset", &rotating_seek_offset); +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RW, + &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " + "triggers a reduced rotating media seek increment"); + +/* Non-rotating media load calculation configuration. */ +static int non_rotating_inc = 0; +TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_inc", &non_rotating_inc); +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RW, + &non_rotating_inc, 0, + "Non-rotating media load increment for non-seeking I/O's"); + +static int non_rotating_seek_inc = 1; +TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_seek_inc", + &non_rotating_seek_inc); +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RW, + &non_rotating_seek_inc, 0, + "Non-rotating media load increment for seeking I/O's"); + + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof(int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_replacing = replacing; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return mm; 
+} static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { @@ -69,55 +139,80 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = { zio_vsd_default_cksum_report }; +static int +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) +{ + uint64_t lastoffset; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + lastoffset = vdev_queue_lastoffset(vd); + + if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) { + /* Non-rotating media. */ + if (lastoffset == zio_offset) + return (load + non_rotating_inc); + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O'a can be aggregated into fewer operations + * on the device, thus avoiding unnecessary per-command + * overhead and boosting performance. + */ + return (load + non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. */ + if (lastoffset == zio_offset) + return (load + rotating_inc); + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O queued to this vdev as they should incure less + * of a seek increment. + */ + if (ABS(lastoffset - zio_offset) < rotating_seek_offset) + return (load + (rotating_seek_inc / 2)); + + /* Apply the full seek increment to all other I/O's. 
*/ + return (load + rotating_seek_inc); +} + + static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) +vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; - int c, d; + int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; - c = BP_GET_NDVAS(zio->io_bp); - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. - */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - + mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, + B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; - mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; - mm->mm_root = B_FALSE; - + mm = vdev_mirror_map_alloc(vd->vdev_children, + (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops), B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; @@ -211,50 +306,121 @@ vdev_mirror_scrub_done(zio_t *zio) } /* - * Try to find a child whose DTL doesn't contain the block we want to read. 
+ * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int preferred) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int c; + + for (c = preferred - 1; c >= 0; c--) { + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, mm->mm_preferred[p])); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. + */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * prefering vdevs based on determined load. + * * If we can't, try the read on any vdev we haven't already tried. */ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; uint64_t txg = zio->io_txg; - int i, c; + int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_readable() returning B_FALSE), don't even try. 
- */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; + if (!vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) - return (c); - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + vdev_queue_register_lastoffset( + mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + int c = vdev_mirror_preferred_child_randomize(zio); + + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); + return (c); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ - for (c = 0; c < mm->mm_children; c++) - if (!mm->mm_child[c].mc_tried) + for (c = 0; c < mm->mm_children; c++) { + if (!mm->mm_child[c].mc_tried) { + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, + zio); return (c); + } + } /* * Every child failed. There's no place left to look. 
@@ -269,7 +435,7 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_alloc(zio); + mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 11f05a012147e..d7d0deeadd3dc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -279,6 +279,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + + vq->vq_lastoffset = 0; } void @@ -783,3 +785,26 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } + +/* + * As these three methods are only used for load calculations we're not concerned + * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex + * use here, instead we prefer to keep it lock free for performance. 
+ */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_lastoffset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_lastoffset); +} + +void +vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) +{ + vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; +} diff --git a/sys/geom/geom.h b/sys/geom/geom.h index a20b3a609ef7c..b91b328fbdfe3 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -266,6 +266,7 @@ int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len); int g_handleattr_int(struct bio *bp, const char *attribute, int val); int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val); +int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val); int g_handleattr_str(struct bio *bp, const char *attribute, const char *str); struct g_consumer * g_new_consumer(struct g_geom *gp); struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c index 733e32452b138..e267f23b52296 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -384,22 +384,25 @@ g_disk_start(struct bio *bp) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; - else if (g_handleattr(bp, "GEOM::hba_vendor", - &dp->d_hba_vendor, 2)) + else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", + dp->d_hba_vendor)) break; - else if (g_handleattr(bp, "GEOM::hba_device", - &dp->d_hba_device, 2)) + else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", + dp->d_hba_device)) break; - else if (g_handleattr(bp, "GEOM::hba_subvendor", - &dp->d_hba_subvendor, 2)) + else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", + dp->d_hba_subvendor)) break; - else if (g_handleattr(bp, "GEOM::hba_subdevice", - &dp->d_hba_subdevice, 2)) + else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", + dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) 
g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); + else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", + dp->d_rotation_rate)) + break; else error = ENOIOCTL; break; diff --git a/sys/geom/geom_disk.h b/sys/geom/geom_disk.h index 5e081c8c14059..989deba02ccf1 100644 --- a/sys/geom/geom_disk.h +++ b/sys/geom/geom_disk.h @@ -93,6 +93,7 @@ struct disk { uint16_t d_hba_device; uint16_t d_hba_subvendor; uint16_t d_hba_subdevice; + uint16_t d_rotation_rate; /* Fields private to the driver */ void *d_drv1; @@ -124,7 +125,8 @@ void disk_media_gone(struct disk *dp, int flag); #define DISK_VERSION_01 0x5856105a #define DISK_VERSION_02 0x5856105b #define DISK_VERSION_03 0x5856105c -#define DISK_VERSION DISK_VERSION_03 +#define DISK_VERSION_04 0x5856105d +#define DISK_VERSION DISK_VERSION_04 #endif /* _KERNEL */ #endif /* _GEOM_GEOM_DISK_H_ */ diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c index 782a444b87865..aa9d0b860197a 100644 --- a/sys/geom/geom_subr.c +++ b/sys/geom/geom_subr.c @@ -874,6 +874,13 @@ g_handleattr_int(struct bio *bp, const char *attribute, int val) return (g_handleattr(bp, attribute, &val, sizeof val)); } +int +g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) +{ + + return (g_handleattr(bp, attribute, &val, sizeof val)); +} + int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { diff --git a/sys/sys/ata.h b/sys/sys/ata.h index f46dd50c443db..27154e7621546 100644 --- a/sys/sys/ata.h +++ b/sys/sys/ata.h @@ -262,6 +262,8 @@ struct ata_params { /*215*/ u_int16_t nv_cache_size_1; u_int16_t nv_cache_size_2; /*217*/ u_int16_t media_rotation_rate; +#define ATA_RATE_NOT_REPORTED 0x0000 +#define ATA_RATE_NON_ROTATING 0x0001 u_int16_t reserved218; /*219*/ u_int16_t nv_cache_opt; /*220*/ u_int16_t wrv_mode;