Illumos 5987 - zfs prefetch code needs work
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>

References:
  https://www.illumos.org/issues/5987 zfs prefetch code needs work
  illumos/illumos-gate@cf6106c 5987 zfs prefetch code needs work

Porting notes:
- [module/zfs/dbuf.c]
  - 5f6d0b6 Handle block pointers with a corrupt logical size
- [module/zfs/dmu_zfetch.c]
  - c65aa5b Fix gcc missing parenthesis warnings
  - 428870f Update core ZFS code from build 121 to build 141.
  - 79c76d5 Change KM_PUSHPAGE -> KM_SLEEP
  - b8d06fc Switch KM_SLEEP to KM_PUSHPAGE
  - Account for ISO C90 "mixed declarations and code" warnings
  - Module parameters (new/changed; see the sketch after the sign-offs below):
    - Replaced zfetch_block_cap with zfetch_max_distance, the max bytes
      to prefetch per stream (default 8MB; 8 * 1024 * 1024)
    - Preserved zfs_prefetch_disable as 'int' for consistency with
      existing Linux module options.
- [include/sys/trace_arc.h]
  - Added new tracepoints
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
- [man/man5/zfs-module-parameters.5]
  - Updated man page

Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
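
For reference, a hedged sketch of how the two module options described in the porting notes are typically declared on the Linux side (the real declarations live in module/zfs/dmu_zfetch.c, whose diff did not load on this page, so treat the exact code as an assumption):

    /* Sketch only; the shipped declarations may differ in detail. */
    unsigned int zfetch_max_distance = 8 * 1024 * 1024; /* 8MB per stream */
    int zfs_prefetch_disable = 0;                       /* kept as 'int' */

    module_param(zfetch_max_distance, uint, 0644);
    MODULE_PARM_DESC(zfetch_max_distance,
            "Max bytes to prefetch per stream (default 8MB)");

    module_param(zfs_prefetch_disable, int, 0644);
    MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching");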
ahrens authored and behlendorf committed Jan 12, 2016
1 parent ab5cbbd commit 7f60329
Showing 10 changed files with 280 additions and 658 deletions.
28 changes: 16 additions & 12 deletions include/sys/arc.h
@@ -84,27 +84,31 @@ typedef enum arc_flags
 	ARC_FLAG_CACHED			= 1 << 4,	/* I/O was in cache */
 	ARC_FLAG_L2CACHE		= 1 << 5,	/* cache in L2ARC */
 	ARC_FLAG_L2COMPRESS		= 1 << 6,	/* compress in L2ARC */
+	ARC_FLAG_PREDICTIVE_PREFETCH	= 1 << 7,	/* I/O from zfetch */
 
 	/*
 	 * Private ARC flags. These flags are private ARC only flags that
 	 * will show up in b_flags in the arc_hdr_buf_t. These flags should
 	 * only be set by ARC code.
 	 */
-	ARC_FLAG_IN_HASH_TABLE		= 1 << 7,	/* buffer is hashed */
-	ARC_FLAG_IO_IN_PROGRESS		= 1 << 8,	/* I/O in progress */
-	ARC_FLAG_IO_ERROR		= 1 << 9,	/* I/O failed for buf */
-	ARC_FLAG_FREED_IN_READ		= 1 << 10,	/* freed during read */
-	ARC_FLAG_BUF_AVAILABLE		= 1 << 11,	/* block not in use */
-	ARC_FLAG_INDIRECT		= 1 << 12,	/* indirect block */
-	ARC_FLAG_L2_WRITING		= 1 << 13,	/* write in progress */
-	ARC_FLAG_L2_EVICTED		= 1 << 14,	/* evicted during I/O */
-	ARC_FLAG_L2_WRITE_HEAD		= 1 << 15,	/* head of write list */
+	ARC_FLAG_IN_HASH_TABLE		= 1 << 8,	/* buffer is hashed */
+	ARC_FLAG_IO_IN_PROGRESS		= 1 << 9,	/* I/O in progress */
+	ARC_FLAG_IO_ERROR		= 1 << 10,	/* I/O failed for buf */
+	ARC_FLAG_FREED_IN_READ		= 1 << 11,	/* freed during read */
+	ARC_FLAG_BUF_AVAILABLE		= 1 << 12,	/* block not in use */
+	ARC_FLAG_INDIRECT		= 1 << 13,	/* indirect block */
+	/* Indicates that block was read with ASYNC priority. */
+	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 14,
+	ARC_FLAG_L2_WRITING		= 1 << 15,	/* write in progress */
+	ARC_FLAG_L2_EVICTED		= 1 << 16,	/* evicted during I/O */
+	ARC_FLAG_L2_WRITE_HEAD		= 1 << 17,	/* head of write list */
 	/* indicates that the buffer contains metadata (otherwise, data) */
-	ARC_FLAG_BUFC_METADATA		= 1 << 16,
+	ARC_FLAG_BUFC_METADATA		= 1 << 18,
 
 	/* Flags specifying whether optional hdr struct fields are defined */
-	ARC_FLAG_HAS_L1HDR		= 1 << 17,
-	ARC_FLAG_HAS_L2HDR		= 1 << 18,
+	ARC_FLAG_HAS_L1HDR		= 1 << 19,
+	ARC_FLAG_HAS_L2HDR		= 1 << 20,
 
 } arc_flags_t;
 
 struct arc_buf {
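
Since every flag must own a distinct bit, inserting ARC_FLAG_PREDICTIVE_PREFETCH at 1 << 7 and ARC_FLAG_PRIO_ASYNC_READ at 1 << 14 is what forces the renumbering of all the private flags below them. A hypothetical compile-time collision check, not part of this commit, illustrates the invariant:

    /* Hypothetical sanity checks; this commit does not add them. */
    CTASSERT((ARC_FLAG_PREDICTIVE_PREFETCH & ARC_FLAG_IN_HASH_TABLE) == 0);
    CTASSERT((ARC_FLAG_PRIO_ASYNC_READ & ARC_FLAG_L2_WRITING) == 0);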
3 changes: 2 additions & 1 deletion include/sys/dmu.h
@@ -487,7 +487,8 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+    uint64_t length, boolean_t read, void *tag,
+    int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 typedef void dmu_buf_evict_func_t(void *user_ptr);
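
A hedged usage sketch of the revised prototype; bonus_db, the offset/length, and the surrounding caller are hypothetical, and the substantive change is only that read is now a boolean_t rather than an int:

    /* Hypothetical call site illustrating the boolean_t read argument. */
    dmu_buf_t **dbp;
    int numbufs;
    int err;

    err = dmu_buf_hold_array_by_bonus(bonus_db, 0 /* offset */,
        1024 /* length */, B_TRUE /* read */, FTAG, &numbufs, &dbp);
    if (err == 0) {
            /* ... consume dbp[0 .. numbufs - 1] ... */
            dmu_buf_rele_array(dbp, numbufs, FTAG);
    }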
37 changes: 15 additions & 22 deletions include/sys/dmu_zfetch.h
@@ -23,8 +23,12 @@
  * Use is subject to license terms.
  */
 
-#ifndef	_DFETCH_H
-#define	_DFETCH_H
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_DMU_ZFETCH_H
+#define	_DMU_ZFETCH_H
 
 #include <sys/zfs_context.h>
 
@@ -36,41 +40,30 @@ extern unsigned long zfetch_array_rd_sz;
 
 struct dnode;				/* so we can reference dnode */
 
-typedef enum zfetch_dirn {
-	ZFETCH_FORWARD = 1,		/* prefetch increasing block numbers */
-	ZFETCH_BACKWARD	= -1		/* prefetch decreasing block numbers */
-} zfetch_dirn_t;
-
 typedef struct zstream {
-	uint64_t	zst_offset;	/* offset of starting block in range */
-	uint64_t	zst_len;	/* length of range, in blocks */
-	zfetch_dirn_t	zst_direction;	/* direction of prefetch */
-	uint64_t	zst_stride;	/* length of stride, in blocks */
-	uint64_t	zst_ph_offset;	/* prefetch offset, in blocks */
-	uint64_t	zst_cap;	/* prefetch limit (cap), in blocks */
-	kmutex_t	zst_lock;	/* protects stream */
-	clock_t		zst_last;	/* lbolt of last prefetch */
-	list_node_t	zst_node;	/* next zstream here */
+	uint64_t	zs_blkid;	/* expect next access at this blkid */
+	uint64_t	zs_pf_blkid;	/* next block to prefetch */
+	kmutex_t	zs_lock;	/* protects stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
+	list_node_t	zs_node;	/* link for zf_stream */
 } zstream_t;
 
 typedef struct zfetch {
 	krwlock_t	zf_rwlock;	/* protects zfetch structure */
-	list_t		zf_stream;	/* AVL tree of zstream_t's */
+	list_t		zf_stream;	/* list of zstream_t's */
 	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
-	uint32_t	zf_stream_cnt;	/* # of active streams */
-	uint64_t	zf_alloc_fail;	/* # of failed attempts to alloc strm */
 } zfetch_t;
 
 void		zfetch_init(void);
 void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
-void		dmu_zfetch_rele(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+void		dmu_zfetch_fini(zfetch_t *);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
 
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif	/* _DFETCH_H */
+#endif	/* _DMU_ZFETCH_H */
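
The slimmed-down stream only tracks where the next sequential access should land and how far ahead prefetch has run. A minimal sketch of the matching step, assuming the caller already holds zs_lock (illustrative only; the real logic is in module/zfs/dmu_zfetch.c, which this page truncates):

    /* Illustrative sketch, not the shipped dmu_zfetch.c code. */
    static boolean_t
    zstream_hit(zstream_t *zs, uint64_t blkid, uint64_t nblks)
    {
            if (blkid != zs->zs_blkid)
                    return (B_FALSE);       /* not sequential: no match */

            zs->zs_blkid = blkid + nblks;   /* expect the next access here */
            zs->zs_atime = gethrtime();     /* freshness, for stream reclaim */
            return (B_TRUE);
    }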
2 changes: 2 additions & 0 deletions include/sys/trace_arc.h
@@ -102,6 +102,8 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);

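When ZFS is built with tracepoint support, these become ordinary Linux trace events (e.g. under events/zfs/ in tracefs), mirroring the DTRACE_PROBE1() sites added to arc.c below; the exact mount path varies by system.
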
11 changes: 7 additions & 4 deletions man/man5/zfs-module-parameters.5
@@ -331,12 +331,12 @@ Default value: \fB1,048,576\fR.
 .sp
 .ne 2
 .na
-\fBzfetch_block_cap\fR (uint)
+\fBzfetch_max_distance\fR (uint)
 .ad
 .RS 12n
-Max number of blocks to prefetch at a time
+Max bytes to prefetch per stream (default 8MB).
 .sp
-Default value: \fB256\fR.
+Default value: \fB8,388,608\fR.
 .RE
 
 .sp
@@ -1246,7 +1246,10 @@ Default value: \fB52,428,800\fR.
 \fBzfs_prefetch_disable\fR (int)
 .ad
 .RS 12n
-Disable all ZFS prefetching
+This tunable disables predictive prefetch. Note that it leaves "prescient"
+prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
+prescient prefetch never issues i/os that end up not being needed, so it
+can't hurt performance.
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
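
On Linux builds this tunable can typically be toggled at runtime through /sys/module/zfs/parameters/zfs_prefetch_disable.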
74 changes: 67 additions & 7 deletions module/zfs/arc.c
@@ -474,6 +474,8 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_meta_limit;
 	kstat_named_t arcstat_meta_max;
 	kstat_named_t arcstat_meta_min;
+	kstat_named_t arcstat_sync_wait_for_async;
+	kstat_named_t arcstat_demand_hit_predictive_prefetch;
 	kstat_named_t arcstat_need_free;
 	kstat_named_t arcstat_sys_free;
 } arc_stats_t;
@@ -568,6 +570,8 @@ static arc_stats_t arc_stats = {
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
+	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
+	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 }
 };
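
The two new counters surface with the rest of the ARC statistics, typically readable from /proc/spl/kstat/zfs/arcstats on Linux.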
@@ -4244,6 +4248,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 
+		if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+		    priority == ZIO_PRIORITY_SYNC_READ) {
+			/*
+			 * This sync read must wait for an
+			 * in-progress async read (e.g. a predictive
+			 * prefetch). Async reads are queued
+			 * separately at the vdev_queue layer, so
+			 * this is a form of priority inversion.
+			 * Ideally, we would "inherit" the demand
+			 * i/o's priority by moving the i/o from
+			 * the async queue to the synchronous queue,
+			 * but there is currently no mechanism to do
+			 * so. Track this so that we can evaluate
+			 * the magnitude of this potential performance
+			 * problem.
+			 *
+			 * Note that if the prefetch i/o is already
+			 * active (has been issued to the device),
+			 * the prefetch improved performance, because
+			 * we issued it sooner than we would have
+			 * without the prefetch.
+			 */
+			DTRACE_PROBE1(arc__sync__wait__for__async,
+			    arc_buf_hdr_t *, hdr);
+			ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+		}
+		if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+			hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+		}
+
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 			mutex_exit(hash_lock);
@@ -4252,7 +4286,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
 		if (done) {
-			arc_callback_t	*acb = NULL;
+			arc_callback_t *acb = NULL;
 
 			acb = kmem_zalloc(sizeof (arc_callback_t),
 			    KM_SLEEP);
@@ -4277,6 +4311,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 		    hdr->b_l1hdr.b_state == arc_mfu);
 
 		if (done) {
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				/*
+				 * This is a demand read which does not have to
+				 * wait for i/o because we did a predictive
+				 * prefetch i/o for it, which has completed.
+				 */
+				DTRACE_PROBE1(
+				    arc__demand__hit__predictive__prefetch,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_predictive_prefetch);
+				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+			}
 			add_reference(hdr, hash_lock, private);
 			/*
 			 * If this block is already in use, create a new
@@ -4349,12 +4396,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 			goto top; /* restart the IO request */
 		}
 
-		/* if this is a prefetch, we don't have a reference */
-		if (*arc_flags & ARC_FLAG_PREFETCH) {
+		/*
+		 * If there is a callback, we pass our reference to
+		 * it; otherwise we remove our reference.
+		 */
+		if (done == NULL) {
 			(void) remove_reference(hdr, hash_lock,
 			    private);
-			hdr->b_flags |= ARC_FLAG_PREFETCH;
 		}
+		if (*arc_flags & ARC_FLAG_PREFETCH)
+			hdr->b_flags |= ARC_FLAG_PREFETCH;
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			hdr->b_flags |= ARC_FLAG_L2CACHE;
 		if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4377,11 +4428,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
-		/* if this is a prefetch, we don't have a reference */
+		/*
+		 * If there is a callback, we pass a reference to it.
+		 */
+		if (done != NULL)
+			add_reference(hdr, hash_lock, private);
 		if (*arc_flags & ARC_FLAG_PREFETCH)
 			hdr->b_flags |= ARC_FLAG_PREFETCH;
-		else
-			add_reference(hdr, hash_lock, private);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			hdr->b_flags |= ARC_FLAG_L2CACHE;
 		if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4399,6 +4452,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 			arc_access(hdr, hash_lock);
 		}
 
+		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+			hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4438,6 +4493,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);
 
+		if (priority == ZIO_PRIORITY_ASYNC_READ)
+			hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
+		else
+			hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
+
 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
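
For context, the producer side of ARC_FLAG_PREDICTIVE_PREFETCH lives in the DMU prefetch path, whose diff did not load on this page. A hedged sketch of how such a speculative read might be issued (spa, bp, and zb are assumed to be in scope; the flag combination is an approximation, not taken from this diff):

    /* Approximate caller-side sketch; the real call site is not shown here. */
    arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
        ARC_FLAG_PREDICTIVE_PREFETCH;

    (void) arc_read(NULL, spa, bp, NULL /* done */, NULL /* private */,
        ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
        &aflags, &zb);

A demand read that later finds this header cached or in flight takes the new branches in arc_read() above and clears the flag.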
21 changes: 9 additions & 12 deletions module/zfs/dbuf.c
@@ -676,7 +676,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 }
 
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
@@ -723,7 +723,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
-		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return (0);
 	}
@@ -746,10 +745,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 
 	err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
-	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
-	if (aflags & ARC_FLAG_CACHED)
-		*flags |= DB_RF_CACHED;
 
 	return (SET_ERROR(err));
 }
@@ -784,8 +781,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
@@ -795,13 +791,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		if (zio == NULL)
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 
-		err = dbuf_read_impl(db, zio, &flags);
+		err = dbuf_read_impl(db, zio, flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (!err && prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, flags & DB_RF_CACHED);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
@@ -820,8 +815,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
@@ -2143,6 +2137,9 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
+	if (blkid > dn->dn_maxblkid)
+		return;
+
 	if (dnode_block_freed(dn, blkid))
 		return;
 
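
The three dmu_zfetch() call sites above show the new interface: callers pass a starting block id and a block count rather than a byte offset/length pair. A simplified model of what the (not shown) module/zfs/dmu_zfetch.c does once a stream matches; the helper name is invented, locking and stream creation are elided, and the trailing arc_flags_t argument to dbuf_prefetch() is assumed from the truncated hunk header above:

    /* Simplified model only; not the shipped dmu_zfetch.c. */
    static void
    dmu_zfetch_stream_advance(zfetch_t *zf, zstream_t *zs, uint64_t blkid,
        uint64_t nblks)
    {
            uint64_t blksz = zf->zf_dnode->dn_datablksz;
            uint64_t pf_cap = blkid + nblks + (zfetch_max_distance / blksz);
            uint64_t blk;

            zs->zs_blkid = blkid + nblks;   /* next sequential access */
            zs->zs_atime = gethrtime();

            /* Issue async reads from the old prefetch point up to the cap. */
            for (blk = MAX(zs->zs_pf_blkid, zs->zs_blkid); blk < pf_cap; blk++)
                    dbuf_prefetch(zf->zf_dnode, 0, blk, ZIO_PRIORITY_ASYNC_READ,
                        ARC_FLAG_PREDICTIVE_PREFETCH);
            zs->zs_pf_blkid = pf_cap;
    }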
[The diffs for the remaining changed files did not load on this page.]
