diff --git a/include/sys/spa.h b/include/sys/spa.h index 401ae8343e96..a4a717ac0450 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -51,6 +51,8 @@ typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark zbookmark_t; + struct dsl_pool; struct dsl_dataset; @@ -534,6 +536,25 @@ extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) +/* Historical pool statistics */ +typedef struct spa_stats_history { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *private; + list_t list; +} spa_stats_history_t; + +typedef struct spa_stats { + spa_stats_history_t read_history; +} spa_stats_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, + uint32_t aflags); + /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 47dfe432eed2..107d14a76cfd 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -236,6 +236,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + spa_stats_t spa_stats; /* assorted spa statistics */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a126c058ebd9..dfd11d9f1e5e 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -210,6 +210,7 @@ typedef struct kthread { } kthread_t; #define curthread zk_thread_current() +#define getcomm() "unknown" #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ @@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int, char *, char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); +extern void kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)); /* * Kernel memory diff --git a/include/sys/zio.h b/include/sys/zio.h index 189966bef54c..f5a128e0b319 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES]; * Therefore it must not change size or alignment between 32/64 bit * compilation options. */ -typedef struct zbookmark { +struct zbookmark { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; -} zbookmark_t; + char * zb_func; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -269,6 +270,7 @@ typedef struct zbookmark { (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ + (zb)->zb_func = FTAG; \ } #define ZB_DESTROYED_OBJSET (-1ULL) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index da81659d240c..b2d6394546a8 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -62,6 +62,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/spa_errlog.c \ $(top_srcdir)/module/zfs/spa_history.c \ $(top_srcdir)/module/zfs/spa_misc.c \ + $(top_srcdir)/module/zfs/spa_stats.c \ $(top_srcdir)/module/zfs/space_map.c \ $(top_srcdir)/module/zfs/txg.c \ $(top_srcdir)/module/zfs/uberblock.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e4d645cf7797..57a3739e23e8 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -240,6 +240,14 @@ void kstat_delete(kstat_t *ksp) {} +/*ARGSUSED*/ +void +kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{} + /* * ========================================================================= * mutexes diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index e71228454110..6f0f6ef05ae1 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -44,6 +44,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o +$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1298c5b91bc6..366f5bf9b5da 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2943,6 +2943,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + int rc = 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2976,10 +2977,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); - return (0); + goto out; } mutex_exit(hash_lock); - return (0); + goto out; } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -3174,12 +3175,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); - return (0); + goto out; } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) - return (0); + goto out; /* l2arc read error; goto zio_read() */ } else { @@ -3203,13 +3204,18 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) - return (zio_wait(rzio)); + if (*arc_flags & ARC_WAIT) { + rc = zio_wait(rzio); + goto out; + } ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } - return (0); + +out: + spa_read_history_add(spa, zb, *arc_flags); + return (rc); } arc_prune_t * diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 32b3e50fc005..1827ffc7413f 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); + SET_BOOKMARK(czb, td->td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_WAIT; @@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); if (err != 0) return (err); @@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td, TQ_NOQUEUE)) pd->pd_exited = B_TRUE; - SET_BOOKMARK(czb, td->td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(td, NULL, rootbp, czb); mutex_enter(&pd->pd_mtx); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ba6e896338b..a0b4797c97cb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "zfs_prop.h" #include "zfeature_common.h" @@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL; */ int zfs_deadman_enabled = 1; - /* * ========================================================================== * SPA config locking @@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); + spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); @@ -580,6 +581,7 @@ spa_remove(spa_t *spa) refcount_destroy(&spa->spa_refcount); + spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c new file mode 100644 index 000000000000..271a3c312f7e --- /dev/null +++ b/module/zfs/spa_stats.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include + +/* + * Keeps stats on last N reads per spa_t, disabled by default. + */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + uint64_t uid; /* unique identifier */ + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + list_node_t srh_link; +} spa_read_history_t; + +static int +spa_read_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_read_history_data(char *buf, size_t size, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->uid, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_read_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_read_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_read_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + if (rw == KSTAT_WRITE) { + spa_read_history_t *srh; + + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t); + + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_read_history_t), + offsetof(spa_read_history_t, srh_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "reads", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_read_history_update; + kstat_set_raw_ops(ksp, spa_read_history_headers, + spa_read_history_data, spa_read_history_addr); + kstat_install(ksp); + } +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh, *rm; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && ssh->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + return; + + srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE); + strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin)); + strlcpy(srh->comm, getcomm(), sizeof(srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&ssh->lock); + + srh->uid = ssh->count++; + list_insert_head(&ssh->list, srh); + ssh->size++; + + while (ssh->size > zfs_read_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_read_history_t)); + } + + mutex_exit(&ssh->lock); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_read_history_destroy(spa); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); +#endif