Skip to content

Commit

Permalink
Balance mirror reads via pending queue length and reqtime
Browse files Browse the repository at this point in the history
When selecting which vdev mirror child to read from,
redirect I/Os to the child vdevs with the smallest pending
queue lengths.

During the vdev selection process, if a readable vdev is found
with a queue length of 0, that vdev is used immediately and the
rest of the selection process described above is skipped.

It is hoped that this will cause all available vdevs to be
utilised while ensuring the more capable devices take on
more of the workload.

closes #1461
  • Loading branch information
b333z committed May 30, 2013
1 parent 0377189 commit ac0df7e
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 2 deletions.
Empty file added .nogitrelease
Empty file.
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,7 @@ typedef struct vdev_stat {
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_request_time_average;/* avg. request time <<8*/
} vdev_stat_t;

/*
Expand Down
1 change: 1 addition & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);
/*
 * Returns a load estimate for vd: (vs_request_time_average >> 8) multiplied
 * by the number of entries in the vdev queue's pending tree plus one.
 */
extern uint64_t vdev_pending_queued(vdev_t *vd);

extern void vdev_hold(vdev_t *);
extern void vdev_rele(vdev_t *);
Expand Down
21 changes: 20 additions & 1 deletion module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1948,6 +1948,23 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
return (needed);
}

/*
 * Compute a load estimate for a vdev, used to compare mirror children.
 *
 * The estimate is the vdev's scaled request-time average
 * (vs_request_time_average >> 8) multiplied by the number of I/Os currently
 * in the queue's pending tree plus one (presumably the extra one accounts
 * for the I/O about to be issued -- TODO confirm).
 *
 * vd: the vdev whose queue and statistics are inspected.
 *
 * Returns the estimate; it is 0 whenever the scaled average is still 0,
 * regardless of how many I/Os are pending.
 */
uint64_t
vdev_pending_queued(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;
	vdev_stat_t *vs = &vd->vdev_stat;
	uint64_t pending;

	/*
	 * Keep the count in an unsigned 64-bit value: narrowing the
	 * avl_numnodes() result into a signed int and then widening it
	 * again for the multiplication below could truncate or sign-extend.
	 */
	mutex_enter(&vq->vq_lock);
	pending = avl_numnodes(&vq->vq_pending_tree);
	mutex_exit(&vq->vq_lock);

	/*
	 * NOTE(review): vs_request_time_average is read without holding
	 * vd->vdev_stat_lock, while vdev_stat_update() appears to modify it
	 * under that lock.  A stale value only skews one estimate; confirm
	 * that this is acceptable on platforms without atomic 64-bit loads.
	 */
	return ((vs->vs_request_time_average >> 8) * (pending + 1));
}

void
vdev_load(vdev_t *vd)
{
Expand Down Expand Up @@ -2614,7 +2631,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)

vs->vs_ops[type]++;
vs->vs_bytes[type] += psize;

if (zio->io_timestamp > 0) {
vs->vs_request_time_average += ((uint64_t)(ddi_get_lbolt64() - zio->io_timestamp + 1) << 8) - (vs->vs_request_time_average >> 8);
}
mutex_exit(&vd->vdev_stat_lock);
return;
}
Expand Down
35 changes: 34 additions & 1 deletion module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include <sys/zio.h>
#include <sys/fs/zfs.h>

/*
 * Tunable gating pending-queue balancing of mirror reads.  When zero (the
 * default) vdev_mirror_child_select() returns the first suitable child
 * without calling vdev_pending_queued(); when non-zero it compares the
 * children's vdev_pending_queued() estimates.
 */
int zfs_vdev_mirror_pending_balance = 0;

/*
* Virtual device vector for mirroring.
*/
Expand Down Expand Up @@ -221,7 +223,10 @@ vdev_mirror_child_select(zio_t *zio)
mirror_map_t *mm = zio->io_vsd;
mirror_child_t *mc;
uint64_t txg = zio->io_txg;
int pending_lowest_child = -1;
uint64_t pending_lowest_count = UINT64_MAX;
int i, c;
uint64_t pending;

ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

Expand All @@ -243,12 +248,35 @@ vdev_mirror_child_select(zio_t *zio)
continue;
}
if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
return (c);
{
if (!zfs_vdev_mirror_pending_balance) /* balance disabled */
return (c);
pending = vdev_pending_queued(mc->mc_vd);
if (pending == 0) {
return (c);
}
if (pending < pending_lowest_count) {
pending_lowest_count = pending;
pending_lowest_child = c;
}
else if (pending == pending_lowest_count) {
if ( c == mm->mm_preferred)
pending_lowest_child = c;
}
continue;
}
mc->mc_error = ESTALE;
mc->mc_skipped = 1;
mc->mc_speculative = 1;
}

/*
* See if we found multiple devices with pending io's
* and return the child with smallest queue.
*/
if ( pending_lowest_child != -1 )
return (pending_lowest_child);

/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
Expand Down Expand Up @@ -492,3 +520,8 @@ vdev_ops_t vdev_spare_ops = {
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

/*
 * Register zfs_vdev_mirror_pending_balance as an int module parameter;
 * only compiled when both _KERNEL and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_mirror_pending_balance, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_pending_balance, "Balance reads from mirror vdev based on member speed and pending queue depth");
#endif

0 comments on commit ac0df7e

Please sign in to comment.