Skip to content

Commit

Permalink
Limit the amount of dnode metadata in the ARC
Browse files Browse the repository at this point in the history
Metadata-intensive workloads can cause the ARC to become permanently
filled with dnode_t objects as they're pinned by the VFS layer.
Subsequent data-intensive workloads may only benefit from about
25% of the potential ARC (arc_c_max - arc_meta_limit).

In order to help track metadata usage more precisely, the other_size
metadata arcstat has replaced with dbuf_size, dnode_size and bonus_size.

The new zfs_arc_dnode_limit tunable, which defaults to 10% of
zfs_arc_meta_limit, defines the minimum number of bytes which is desirable
to be consumed by dnodes.  Attempts to evict non-metadata will trigger
async prune tasks if the space used by dnodes exceeds this limit.

The new zfs_arc_dnode_reduce_percent tunable specifies the amount by
which the excess dnode space is attempted to be pruned as a percentage of
the amount by which zfs_arc_dnode_limit is being exceeded.  By default,
it tries to unpin 10% of the dnodes.

The problem of dnode metadata pinning was observed with the following
testing procedure (in this example, zfs_arc_max is set to 4GiB):

    - Create a large number of small files until arc_meta_used exceeds
      arc_meta_limit (3GiB with default tuning) and arc_prune
      starts increasing.

    - Create a 3GiB file with dd.  Observe arc_mata_used.  It will still
      be around 3GiB.

    - Repeatedly read the 3GiB file and observe arc_meta_limit as before.
      It will continue to stay around 3GiB.

With this modification, space for the 3GiB file is gradually made
available as subsequent demands on the ARC are made.
  • Loading branch information
dweeezil committed Jul 15, 2016
1 parent d30abeb commit 650f6cf
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 25 deletions.
4 changes: 3 additions & 1 deletion include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ typedef enum arc_space_type {
ARC_SPACE_META,
ARC_SPACE_HDRS,
ARC_SPACE_L2HDRS,
ARC_SPACE_OTHER,
ARC_SPACE_DBUF,
ARC_SPACE_DNODE,
ARC_SPACE_BONUS,
ARC_SPACE_NUMTYPES
} arc_space_type_t;

Expand Down
31 changes: 31 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,37 @@ Min time before an active prefetch stream can be reclaimed
Default value: \fB2\fR.
.RE

.sp
.ne 2
.na
\fBzfs_arc_dnode_limit\fR (ulong)
.ad
.RS 12n
When the number of bytes consumed by dnodes in the ARC exceeds this number of
bytes, try to unpin some of it in response to demand for non-metadata. This
value acts as a floor to the amount of dnode metadata.

See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used
when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather
than in response to overall demand for non-metadata.

.sp
Default value: \fB10% of zfs_arc_meta_limit\fR.
.RE

.sp
.ne 2
.na
\fBzfs_arc_dnode_reduce_percent\fR (ulong)
.ad
.RS 12n
Percentage of ARC dnodes to try to scan in response to demand for non-metadata
when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fB.

.sp
Default value: \fB10% of the number of dnodes in the ARC\fR.
.RE

.sp
.ne 2
.na
Expand Down
83 changes: 68 additions & 15 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
unsigned long zfs_arc_meta_min = 0;
unsigned long zfs_arc_dnode_limit = 0;
unsigned long zfs_arc_dnode_reduce_percent = 10;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
Expand Down Expand Up @@ -328,13 +330,17 @@ typedef struct arc_stats {
*/
kstat_named_t arcstat_metadata_size;
/*
* Number of bytes consumed by various buffers and structures
* not actually backed with ARC buffers. This includes bonus
* buffers (allocated directly via zio_buf_* functions),
* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
* cache), and dnode_t structures (allocated via dnode_t cache).
* Number of bytes consumed by dmu_buf_impl_t objects.
*/
kstat_named_t arcstat_other_size;
kstat_named_t arcstat_dbuf_size;
/*
* Number of bytes consumed by dnode_t objects.
*/
kstat_named_t arcstat_dnode_size;
/*
* Number of bytes consumed by bonus buffers.
*/
kstat_named_t arcstat_bonus_size;
/*
* Total number of bytes consumed by ARC buffers residing in the
* arc_anon state. This includes *all* buffers in the arc_anon
Expand Down Expand Up @@ -472,6 +478,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_prune;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_dnode_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_need_free;
Expand Down Expand Up @@ -514,7 +521,9 @@ static arc_stats_t arc_stats = {
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "metadata_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
{ "dbuf_size", KSTAT_DATA_UINT64 },
{ "dnode_size", KSTAT_DATA_UINT64 },
{ "bonus_size", KSTAT_DATA_UINT64 },
{ "anon_size", KSTAT_DATA_UINT64 },
{ "anon_evictable_data", KSTAT_DATA_UINT64 },
{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
Expand Down Expand Up @@ -566,6 +575,7 @@ static arc_stats_t arc_stats = {
{ "arc_prune", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
Expand Down Expand Up @@ -635,9 +645,13 @@ static arc_state_t *arc_l2c_only;
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */

Expand Down Expand Up @@ -793,6 +807,7 @@ static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);
static void arc_prune_async(int64_t);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
Expand Down Expand Up @@ -1666,8 +1681,14 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
case ARC_SPACE_META:
ARCSTAT_INCR(arcstat_metadata_size, space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
case ARC_SPACE_BONUS:
ARCSTAT_INCR(arcstat_bonus_size, space);
break;
case ARC_SPACE_DNODE:
ARCSTAT_INCR(arcstat_dnode_size, space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, space);
Expand Down Expand Up @@ -1697,8 +1718,14 @@ arc_space_return(uint64_t space, arc_space_type_t type)
case ARC_SPACE_META:
ARCSTAT_INCR(arcstat_metadata_size, -space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
case ARC_SPACE_BONUS:
ARCSTAT_INCR(arcstat_bonus_size, -space);
break;
case ARC_SPACE_DNODE:
ARCSTAT_INCR(arcstat_dnode_size, -space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, -space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, -space);
Expand Down Expand Up @@ -2585,16 +2612,25 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
* we're evicting all available buffers.
*/
while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;

/*
* Try to reduce pinned dnodes with a floor of arc_dnode_limit.
* Request that 10% of the LRUs be scanned by the superblock
* shrinker.
*/
if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
arc_prune_async((arc_dnode_size - arc_dnode_limit) /
sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);

/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;

for (i = 0; i < num_sublists; i++) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
Expand Down Expand Up @@ -5250,7 +5286,8 @@ arc_tuning_update(void)
arc_c_max = zfs_arc_max;
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
arc_meta_limit = MIN(arc_meta_limit, (3 * arc_c_max) / 4);
arc_dnode_limit = arc_meta_limit / 10;
}

/* Valid range: 32M - <arc_c_max> */
Expand All @@ -5267,6 +5304,7 @@ arc_tuning_update(void)
(zfs_arc_meta_min <= arc_c_max)) {
arc_meta_min = zfs_arc_meta_min;
arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
arc_dnode_limit = arc_meta_limit / 10;
}

/* Valid range: <arc_meta_min> - <arc_c_max> */
Expand All @@ -5275,6 +5313,12 @@ arc_tuning_update(void)
(zfs_arc_meta_limit <= arc_c_max))
arc_meta_limit = zfs_arc_meta_limit;

/* Valid range: <arc_meta_min> - <arc_c_max> */
if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
(zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
(zfs_arc_dnode_limit <= arc_c_max))
arc_dnode_limit = zfs_arc_dnode_limit;

/* Valid range: 1 - N */
if (zfs_arc_grow_retry)
arc_grow_retry = zfs_arc_grow_retry;
Expand Down Expand Up @@ -5363,6 +5407,8 @@ arc_init(void)
arc_meta_max = 0;
/* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
/* Default dnode limit is 10% of overall meta limit */
arc_dnode_limit = arc_meta_limit / 10;

/* Apply user specified tunings */
arc_tuning_update();
Expand Down Expand Up @@ -7090,4 +7136,11 @@ MODULE_PARM_DESC(zfs_arc_lotsfree_percent,
module_param(zfs_arc_sys_free, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes");

module_param(zfs_arc_dnode_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_dnode_limit, "Minimum bytes of dnodes in arc");

module_param(zfs_arc_dnode_reduce_percent, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent,
"Percentage of excess dnodes to try to unpin");

#endif
14 changes: 7 additions & 7 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)

ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
Expand Down Expand Up @@ -884,7 +884,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
Expand Down Expand Up @@ -1733,7 +1733,7 @@ dbuf_clear(dmu_buf_impl_t *db)
ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
}
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
Expand Down Expand Up @@ -1892,7 +1892,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
return (db);
} else if (blkid == DMU_SPILL_BLKID) {
db->db.db_size = (blkptr != NULL) ?
Expand Down Expand Up @@ -1926,7 +1926,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);

if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
Expand Down Expand Up @@ -2003,7 +2003,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_data_pending == NULL);

kmem_cache_free(dbuf_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
}

void
Expand Down Expand Up @@ -2649,7 +2649,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)

if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_BONUS);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dnh->dnh_dnode = dn;
mutex_exit(&os->os_lock);

arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
return (dn);
}

Expand Down Expand Up @@ -526,7 +526,7 @@ dnode_destroy(dnode_t *dn)

dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);

if (complete_os_eviction)
dmu_objset_evict_done(os);
Expand Down

0 comments on commit 650f6cf

Please sign in to comment.