diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 50f54a36b0f9..3a231c534d89 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -199,33 +199,6 @@ static void metaslab_set_fragmentation(metaslab_t *); kmem_cache_t *metaslab_alloc_trace_cache; -/* - * How many TXG's worth of updates should be aggregated per TRIM/UNMAP - * issued to the underlying vdev. We keep two range trees of extents - * (called "trim sets") to be trimmed per metaslab, the `current' and - * the `previous' TS. New free's are added to the current TS. Then, - * once `zfs_txgs_per_trim' transactions have elapsed, the `current' - * TS becomes the `previous' TS and a new, blank TS is created to be - * the new `current', which will then start accumulating any new frees. - * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's - * extents are trimmed, the TS is destroyed and the current TS again - * becomes the previous TS. - * This serves to fulfill two functions: aggregate many small frees - * into fewer larger trim operations (which should help with devices - * which do not take so kindly to them) and to allow for disaster - * recovery (extents won't get trimmed immediately, but instead only - * after passing this rather long timeout, thus preserving - * 'zfs import -F' functionality). - * The exact default value of this tunable is a tradeoff between: - * 1) Keeping the trim commands reasonably small. - * 2) Keeping the ability to rollback back for as many txgs as possible. - * 3) Waiting around too long that the user starts to get uneasy about not - * seeing any space being freed after they remove some files. - * The default value of 32 is the maximum number of uberblocks in a vdev - * label, assuming a 4k physical sector size (which seems to be the almost - * universal smallest sector size used in SSDs). - */ -unsigned int zfs_txgs_per_trim = 32; /* * Maximum number of bytes we'll put into a single zio_trim. This is for * vdev queue processing purposes and also because some devices advertise @@ -236,13 +209,11 @@ uint64_t zfs_max_bytes_per_trim = 128 << 20; static void metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size); static void metaslab_trim_add(void *arg, uint64_t offset, uint64_t size); +static uint64_t metaslab_trimming_space(const metaslab_t *msp); static zio_t *metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim); -static metaslab_trimset_t *metaslab_new_trimset(uint64_t txg, kmutex_t *lock); -static void metaslab_free_trimset(metaslab_trimset_t *ts); -static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, - uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit); +static void metaslab_free_trimset(range_tree_t *ts); /* * ========================================================================== @@ -523,7 +494,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) } msp_free_space = range_tree_space(msp->ms_tree) + allocated + - msp->ms_deferspace + range_tree_space(msp->ms_freedtree); + msp->ms_deferspace + range_tree_space(msp->ms_freedtree) + + metaslab_trimming_space(msp); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -1147,20 +1119,16 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, - uint64_t size, uint64_t align) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); - for (; rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); - - if (offset + size <= rs->rs_end && - !metaslab_check_trim_conflict(msp, &offset, size, align, - rs->rs_end)) { - *cursor = offset + size; - return (offset); + while (rs != NULL) { + if (rs->rs_start + size <= rs->rs_end) { + *cursor = rs->rs_start + size; + return (rs->rs_start); } + rs = AVL_NEXT(t, rs); } /* @@ -1171,35 +1139,9 @@ metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(msp, t, cursor, size, align)); + return (metaslab_block_picker(t, cursor, size)); } -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_tree->rt_root; - - return (metaslab_block_picker(msp, t, cursor, size, align)); -} - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - /* * ========================================================================== * Dynamic block allocator - @@ -1241,7 +1183,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); + return (metaslab_block_picker(t, cursor, size)); } static metaslab_ops_t metaslab_df_ops = { @@ -1275,16 +1217,11 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs; rs = avl_last(&msp->ms_size_tree); - for (; rs != NULL && rs->rs_end - rs->rs_start >= size; - rs = AVL_PREV(&msp->ms_size_tree, rs)) { - if (!metaslab_check_trim_conflict(msp, cursor, size, - 1, *cursor_end)) { - /* segment appears to be acceptable */ - break; - } - } if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); + + *cursor = rs->rs_start; + *cursor_end = rs->rs_end; } offset = *cursor; @@ -1321,8 +1258,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_block_maxsize(msp); - /* mutable copy for adjustment by metaslab_check_trim_conflict */ - uint64_t adjustable_start; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); @@ -1334,12 +1269,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); - if (rs != NULL) - adjustable_start = rs->rs_start; - if (rs == NULL || rs->rs_end - adjustable_start < size || - metaslab_check_trim_conflict(msp, &adjustable_start, size, 1, - rs->rs_end)) { - /* segment not usable, try the largest remaining one */ + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { t = &msp->ms_size_tree; rsearch.rs_start = 0; @@ -1349,17 +1279,13 @@ 
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); - adjustable_start = rs->rs_start; - if (rs->rs_end - adjustable_start < size || - metaslab_check_trim_conflict(msp, &adjustable_start, - size, 1, rs->rs_end)) { - /* even largest remaining segment not usable */ - return (-1ULL); - } } - *cursor = adjustable_start + size; - return (*cursor); + if ((rs->rs_end - rs->rs_start) >= size) { + *cursor = rs->rs_start + size; + return (rs->rs_start); + } + return (-1ULL); } static metaslab_ops_t metaslab_ndf_ops = { @@ -1423,6 +1349,14 @@ metaslab_load(metaslab_t *msp) range_tree_walk(msp->ms_defertree[t], metaslab_trim_remove, msp); } + /* + * If there's a trim ongoing, punch out the holes that will + * be filled back in in metaslab_trim_done. + */ + if (msp->ms_trimming_ts != NULL) { + range_tree_walk(msp->ms_trimming_ts, range_tree_remove, + msp->ms_tree); + } msp->ms_max_size = metaslab_block_maxsize(msp); } cv_broadcast(&msp->ms_load_cv); @@ -1473,7 +1407,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ASSERT(ms->ms_sm != NULL); } - ms->ms_cur_ts = metaslab_new_trimset(0, &ms->ms_lock); + ms->ms_cur_ts = range_tree_create(NULL, NULL, &ms->ms_lock); /* * We create the main range tree here, but we don't create the @@ -2152,6 +2086,10 @@ metaslab_should_condense(metaslab_t *msp) segsz = entries * sizeof (uint64_t); optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + if (msp->ms_trimming_ts != NULL) { + optimal_size += sizeof (uint64_t) * + avl_numnodes(&msp->ms_trimming_ts->rt_root); + } object_size = space_map_length(msp->ms_sm); dmu_object_info_from_db(sm->sm_dbuf, &doi); @@ -2183,7 +2121,9 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), + space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root) + + (msp->ms_trimming_ts != NULL ? + avl_numnodes(&msp->ms_trimming_ts->rt_root) : 0), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; @@ -2244,7 +2184,21 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_tree, SM_FREE, tx); + if (msp->ms_trimming_ts == NULL) { + space_map_write(sm, msp->ms_tree, SM_FREE, tx); + } else { + /* + * While trimming, the stuff being trimmed isn't in ms_tree, + * but we still want our persistent state to reflect that. So + * we construct a temporary union of the two trees. + */ + range_tree_t *rt = range_tree_create(NULL, NULL, &msp->ms_lock); + range_tree_walk(msp->ms_tree, range_tree_add, rt); + range_tree_walk(msp->ms_trimming_ts, range_tree_add, rt); + space_map_write(sm, rt, SM_FREE, tx); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + } msp->ms_condensing = B_FALSE; cv_broadcast(&msp->ms_condensing_cv); } @@ -2344,6 +2298,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); + if (msp->ms_trimming_ts != NULL) { + /* Stuff currently being trimmed is also free. 
*/ + space_map_histogram_add(msp->ms_sm, + msp->ms_trimming_ts, tx); + } + /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus @@ -2946,11 +2906,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same - * metaslab. Keep in mind, it might have happened due - * to an ongoing trim. + * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize) || - msp->ms_trimming_ts != NULL); + ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -3255,12 +3213,16 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, - msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_tree) + size + + metaslab_trimming_space(msp), <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY(!range_tree_contains(msp->ms_alloctree[txg & TXG_MASK], offset, size)); + if (msp->ms_trimming_ts != NULL) { + VERIFY(!range_tree_contains(msp->ms_trimming_ts, + offset, size)); + } range_tree_add(msp->ms_tree, offset, size); msp->ms_max_size = metaslab_block_maxsize(msp); if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON && @@ -3503,16 +3465,18 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { range_tree_verify(msp->ms_tree, offset, size); + if (msp->ms_trimming_ts) { + range_tree_verify(msp->ms_trimming_ts, + offset, size); + } #ifdef DEBUG - VERIFY3P(&msp->ms_lock, ==, - msp->ms_cur_ts->ts_tree->rt_lock); - range_tree_verify(msp->ms_cur_ts->ts_tree, - offset, size); + VERIFY3P(&msp->ms_lock, ==, msp->ms_cur_ts->rt_lock); + range_tree_verify(msp->ms_cur_ts, offset, size); if (msp->ms_prev_ts != NULL) { VERIFY3P(&msp->ms_lock, ==, - msp->ms_prev_ts->ts_tree->rt_lock); - range_tree_verify(msp->ms_prev_ts->ts_tree, - offset, size); + msp->ms_prev_ts->rt_lock); + range_tree_verify(msp->ms_prev_ts, offset, + size); } #endif } @@ -3572,7 +3536,7 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, * from the last cursor position, but not more than the trim run * limit. */ - range_tree_vacate(msp->ms_cur_ts->ts_tree, NULL, NULL); + range_tree_vacate(msp->ms_cur_ts, NULL, NULL); rsearch.rs_start = cur; rsearch.rs_end = cur + SPA_MINBLOCKSIZE; @@ -3604,7 +3568,7 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, if (trimmed_space != 0) { /* Force this trim to take place ASAP. 
*/ msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = metaslab_new_trimset(0, &msp->ms_lock); + msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); trim_io = metaslab_exec_trim(msp, B_FALSE); ASSERT(trim_io != NULL); @@ -3635,11 +3599,11 @@ metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size) { metaslab_t *msp = arg; - range_tree_clear(msp->ms_cur_ts->ts_tree, offset, size); + range_tree_clear(msp->ms_cur_ts, offset, size); if (msp->ms_prev_ts != NULL) - range_tree_clear(msp->ms_prev_ts->ts_tree, offset, size); + range_tree_clear(msp->ms_prev_ts, offset, size); ASSERT(msp->ms_trimming_ts == NULL || - !range_tree_contains(msp->ms_trimming_ts->ts_tree, offset, size)); + !range_tree_contains(msp->ms_trimming_ts, offset, size)); } /* @@ -3654,16 +3618,25 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_cur_ts != NULL); - range_tree_add(msp->ms_cur_ts->ts_tree, offset, size); - if (msp->ms_prev_ts != NULL) { - ASSERT(!range_tree_contains_part(msp->ms_prev_ts->ts_tree, - offset, size)); - } + range_tree_add(msp->ms_cur_ts, offset, size); + ASSERT(msp->ms_prev_ts == NULL || + !range_tree_contains_part(msp->ms_prev_ts, offset, size)); +} + +/* + * Returns the amount of space currently being trimmed. + */ +static uint64_t +metaslab_trimming_space(const metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_trimming_ts == NULL) + return (0); + return (range_tree_space(msp->ms_trimming_ts)); } /* - * Does a metaslab's automatic trim operation processing. This function - * issues trims in intervals as dictated by the zfs_txgs_per_trim tunable. + * Does a metaslab's automatic trim operation processing. * If the previous trimset has not yet finished trimming, this function * decides what to do based on `preserve_spilled'. If preserve_spilled is * false, the next trimset which would have been issued is simply dropped to @@ -3671,78 +3644,55 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) * trimset. */ void -metaslab_auto_trim(metaslab_t *msp, uint64_t txg, boolean_t preserve_spilled) +metaslab_auto_trim(metaslab_t *msp, boolean_t preserve_spilled) { - /* for atomicity */ - uint64_t txgs_per_trim = zfs_txgs_per_trim; - ASSERT(!MUTEX_HELD(&msp->ms_lock)); mutex_enter(&msp->ms_lock); /* - * Since we typically have hundreds of metaslabs per vdev, but we only - * trim them once every zfs_txgs_per_trim txgs, it'd be best if we - * could sequence the TRIM commands from all metaslabs so that they - * don't all always pound the device in the same txg. We do so by - * artificially inflating the birth txg of the first trim set by a - * sequence number derived from the metaslab's starting offset - * (modulo zfs_txgs_per_trim). Thus, for the default 200 metaslabs and - * 32 txgs per trim, we'll only be trimming ~6.25 metaslabs per txg. - * - * If we detect that the txg has advanced too far ahead of ts_birth, - * it means our birth txg is out of lockstep. Recompute it by - * rounding down to the nearest zfs_txgs_per_trim multiple and adding - * our metaslab id modulo zfs_txgs_per_trim. + * Always swap out the current and previous trimsets. Normally this + * should be done at intervals of zfs_txgs_per_trim. The code which + * controls this is in vdev_auto_trim. 
*/ - if (txg > msp->ms_cur_ts->ts_birth + txgs_per_trim) { - msp->ms_cur_ts->ts_birth = (txg / txgs_per_trim) * - txgs_per_trim + (msp->ms_id % txgs_per_trim); - } - - /* Time to swap out the current and previous trimsets */ - if (txg == msp->ms_cur_ts->ts_birth + txgs_per_trim) { - if (msp->ms_prev_ts != NULL) { - if (msp->ms_trimming_ts != NULL) { - spa_t *spa = msp->ms_group->mg_class->mc_spa; - /* - * The previous trim run is still ongoing, so - * the device is reacting slowly to our trim - * requests. Drop this trimset, so as not to - * back the device up with trim requests. - */ - if (preserve_spilled) { - DTRACE_PROBE1(preserve__spilled, - metaslab_t *, msp); - range_tree_vacate( - msp->ms_prev_ts->ts_tree, - range_tree_add, - msp->ms_cur_ts->ts_tree); - } else { - DTRACE_PROBE1(drop__spilled, - metaslab_t *, msp); - spa_trimstats_auto_slow_incr(spa); - } - metaslab_free_trimset(msp->ms_prev_ts); - } else if (msp->ms_group->mg_vd->vdev_man_trimming) { - /* - * If a manual trim is ongoing, we want to - * inhibit autotrim temporarily so it doesn't - * slow down the manual trim. - */ - metaslab_free_trimset(msp->ms_prev_ts); + if (msp->ms_prev_ts != NULL) { + if (msp->ms_trimming_ts != NULL) { + spa_t *spa = msp->ms_group->mg_class->mc_spa; + /* + * The previous trim run is still ongoing, so the + * device is reacting slowly to trims. Consider + * dropping this trimset, so as not to back the + * device up. + */ + if (preserve_spilled) { + DTRACE_PROBE1(preserve__spilled, + metaslab_t *, msp); + range_tree_vacate(msp->ms_prev_ts, + range_tree_add, msp->ms_cur_ts); } else { - /* - * Trim out aged extents on the vdevs - these - * are safe to be destroyed now. We'll keep - * the trimset around to deny allocations from - * these regions while the trims are ongoing. - */ - zio_nowait(metaslab_exec_trim(msp, B_TRUE)); + DTRACE_PROBE1(drop__spilled, metaslab_t *, msp); + spa_trimstats_auto_slow_incr(spa); } + metaslab_free_trimset(msp->ms_prev_ts); + } else if (msp->ms_group->mg_vd->vdev_man_trimming) { + /* + * If a manual trim is ongoing, we want to inhibit + * autotrim temporarily so it doesn't slow down the + * manual trim. + */ + metaslab_free_trimset(msp->ms_prev_ts); + } else { + /* + * Trim out aged extents on the vdevs - these are safe + * to be destroyed now. We'll keep the trimset around + * to deny allocations from these regions while the + * trims are ongoing. + */ + zio_nowait(metaslab_exec_trim(msp, B_TRUE)); } - msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = metaslab_new_trimset(txg, &msp->ms_lock); } + msp->ms_prev_ts = msp->ms_cur_ts; + msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); + mutex_exit(&msp->ms_lock); } @@ -3756,15 +3706,15 @@ metaslab_auto_trim(metaslab_t *msp, uint64_t txg, boolean_t preserve_spilled) * get it "close enough". 
*/ static uint64_t -metaslab_trimset_mem_used(metaslab_trimset_t *ts) +metaslab_trimset_mem_used(range_tree_t *ts) { uint64_t result = 0; - result += avl_numnodes(&ts->ts_tree->rt_root) * (sizeof (range_seg_t) + + result += avl_numnodes(&ts->rt_root) * (sizeof (range_seg_t) + sizeof (dkioc_free_list_ext_t)); - result += ((range_tree_space(ts->ts_tree) / zfs_max_bytes_per_trim) + - 1) * sizeof (zio_t); - result += sizeof (range_tree_t) + sizeof (metaslab_trimset_t); + result += ((range_tree_space(ts) / zfs_max_bytes_per_trim) + 1) * + sizeof (zio_t); + result += sizeof (range_tree_t); return (result); } @@ -3799,6 +3749,10 @@ metaslab_trim_done(zio_t *zio) held = MUTEX_HELD(&msp->ms_lock); if (!held) mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) { + range_tree_walk(msp->ms_trimming_ts, range_tree_add, + msp->ms_tree); + } metaslab_free_trimset(msp->ms_trimming_ts); msp->ms_trimming_ts = NULL; cv_broadcast(&msp->ms_trim_cv); @@ -3844,24 +3798,33 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) cv_wait(&msp->ms_trim_cv, &msp->ms_lock); msp->ms_trimming_ts = msp->ms_prev_ts; msp->ms_prev_ts = NULL; - trim_tree = msp->ms_trimming_ts->ts_tree; -#ifdef DEBUG + trim_tree = msp->ms_trimming_ts; + if (msp->ms_loaded) { for (range_seg_t *rs = avl_first(&trim_tree->rt_root); rs != NULL; rs = AVL_NEXT(&trim_tree->rt_root, rs)) { +#ifdef DEBUG if (!range_tree_contains_part(msp->ms_tree, rs->rs_start, rs->rs_end - rs->rs_start)) { panic("trimming allocated region; rs=%p", (void*)rs); } +#endif /* DEBUG */ + /* + * To avoid allocating from the range of extents we're + * currently destroying, temporarily remove them from + * the tree of free space. They'll then be added back + * in in metaslab_trim_done. + */ + range_tree_remove(msp->ms_tree, rs->rs_start, + rs->rs_end - rs->rs_start); } } -#endif /* Nothing to trim */ if (range_tree_space(trim_tree) == 0) { metaslab_free_trimset(msp->ms_trimming_ts); - msp->ms_trimming_ts = 0; + msp->ms_trimming_ts = NULL; return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); } @@ -3913,65 +3876,11 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) } /* - * Allocates and initializes a new trimset structure. The `txg' argument - * indicates when this trimset was born and `lock' indicates the lock to - * link to the range tree. - */ -static metaslab_trimset_t * -metaslab_new_trimset(uint64_t txg, kmutex_t *lock) -{ - metaslab_trimset_t *ts; - - ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); - ts->ts_birth = txg; - ts->ts_tree = range_tree_create(NULL, NULL, lock); - - return (ts); -} - -/* - * Destroys and frees a trim set previously allocated by metaslab_new_trimset. + * Destroys and frees a trim set. */ static void -metaslab_free_trimset(metaslab_trimset_t *ts) -{ - range_tree_vacate(ts->ts_tree, NULL, NULL); - range_tree_destroy(ts->ts_tree); - kmem_free(ts, sizeof (*ts)); -} - -/* - * Checks whether an allocation conflicts with an ongoing trim operation in - * the given metaslab. This function takes a segment starting at `*offset' - * of `size' and checks whether it hits any region in the metaslab currently - * being trimmed. If yes, it tries to adjust the allocation to the end of - * the region being trimmed (P2ROUNDUP aligned by `align'), but only up to - * `limit' (no part of the allocation is allowed to go past this point). - * - * Returns B_FALSE if either the original allocation wasn't in conflict, or - * the conflict could be resolved by adjusting the value stored in `offset' - * such that the whole allocation still fits below `limit'. 
Returns B_TRUE - * if the allocation conflict couldn't be resolved. - */ -static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, - uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit) +metaslab_free_trimset(range_tree_t *ts) { - uint64_t new_offset; - - ASSERT3U(*offset + size, <=, limit); - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if (msp->ms_trimming_ts == NULL) - /* no trim conflict, original offset is OK */ - return (B_FALSE); - - new_offset = P2ROUNDUP(range_tree_find_gap(msp->ms_trimming_ts->ts_tree, - *offset, size), align); - if (new_offset + size > limit) - /* trim conflict and adjustment not possible */ - return (B_TRUE); - - /* trim conflict, but adjusted offset still within limit */ - *offset = new_offset; - return (B_FALSE); + range_tree_vacate(ts, NULL, NULL); + range_tree_destroy(ts); } diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index 0ee64783db5e..059dc6d3f373 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -325,23 +325,6 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) return (NULL); } -/* - * Given an extent start offset and size, will look through the provided - * range tree and find a suitable start offset (starting at `start') such - * that the requested extent _doesn't_ overlap with any range segment in - * the range tree. - */ -uint64_t -range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(rt->rt_lock)); - while ((rs = range_tree_find_impl(rt, start, size)) != NULL) - start = rs->rs_end; - return (start); -} - void range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index ccf0c590770f..b54be361f6ae 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -57,7 +57,7 @@ void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); -void metaslab_auto_trim(metaslab_t *, uint64_t, boolean_t); +void metaslab_auto_trim(metaslab_t *, boolean_t); uint64_t metaslab_trim_mem_used(metaslab_t *); #define METASLAB_HINTBP_FAVOR 0x0 diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index f2b7c217a643..88ef378f737d 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -247,11 +247,6 @@ struct metaslab_group { uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; -typedef struct { - uint64_t ts_birth; /* TXG at which this trimset starts */ - range_tree_t *ts_tree; /* tree of extents in the trimset */ -} metaslab_trimset_t; - /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. 
@@ -326,10 +321,10 @@ struct metaslab {
 	range_tree_t *ms_alloctree[TXG_SIZE];
 	range_tree_t *ms_tree;
 
-	metaslab_trimset_t *ms_cur_ts; /* currently prepared trims */
-	metaslab_trimset_t *ms_prev_ts; /* previous (aging) trims */
-	kcondvar_t ms_trim_cv;
-	metaslab_trimset_t *ms_trimming_ts;
+	range_tree_t *ms_cur_ts; /* currently prepared trims */
+	range_tree_t *ms_prev_ts; /* previous (aging) trims */
+	kcondvar_t ms_trim_cv;
+	range_tree_t *ms_trimming_ts; /* in flight trims */
 
 	/*
 	 * The following range trees are accessed only from syncing context.
diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h
index 3bf8d0a4c73c..a7f42eac245a 100644
--- a/usr/src/uts/common/fs/zfs/sys/range_tree.h
+++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h
@@ -81,7 +81,6 @@ void range_tree_destroy(range_tree_t *rt);
 boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
 boolean_t range_tree_contains_part(range_tree_t *rt, uint64_t start,
     uint64_t size);
-uint64_t range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size);
 uint64_t range_tree_space(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index ab819bc0332c..097ef656a7cf 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -82,6 +82,34 @@ int metaslabs_per_vdev = 200;
  */
 uint64_t zfs_trim_mem_lim_fact = 50;
 
+/*
+ * How many TXGs' worth of updates should be aggregated per TRIM/UNMAP
+ * issued to the underlying vdev. We keep two range trees of extents
+ * (called "trim sets") to be trimmed per metaslab, the `current' and
+ * the `previous' TS. New frees are added to the current TS. Then,
+ * once `zfs_txgs_per_trim' transactions have elapsed, the `current'
+ * TS becomes the `previous' TS and a new, blank TS is created to be
+ * the new `current', which will then start accumulating any new frees.
+ * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's
+ * extents are trimmed, the TS is destroyed and the current TS again
+ * becomes the previous TS.
+ * This serves to fulfill two functions: to aggregate many small frees
+ * into fewer, larger trim operations (which should help with devices
+ * which do not take so kindly to them) and to allow for disaster
+ * recovery (extents won't get trimmed immediately, but instead only
+ * after passing this rather long timeout, thus preserving
+ * 'zfs import -F' functionality).
+ * The exact default value of this tunable is a tradeoff between:
+ * 1) Keeping the trim commands reasonably small.
+ * 2) Keeping the ability to roll back for as many txgs as possible.
+ * 3) Not waiting so long that the user starts to get uneasy about not
+ *    seeing any space being freed after they remove some files.
+ * The default value of 32 is the maximum number of uberblocks in a vdev
+ * label, assuming a 4k physical sector size (which seems to be the almost
+ * universal smallest sector size used in SSDs).
+ */
+unsigned int zfs_txgs_per_trim = 32;
+
 /*
  * Given a vdev type, return the appropriate ops vector.
*/ @@ -3610,6 +3638,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) vdev_t *vd = vti->vti_vdev; spa_t *spa = vd->vdev_spa; uint64_t txg = vti->vti_txg; + uint64_t txgs_per_trim = zfs_txgs_per_trim; uint64_t mlim = 0, mused = 0; boolean_t limited; @@ -3626,8 +3655,20 @@ vdev_auto_trim(vdev_trim_info_t *vti) limited = mused > mlim; DTRACE_PROBE3(autotrim__mem__lim, vdev_t *, vd, uint64_t, mused, uint64_t, mlim); - for (uint64_t i = 0; i < vd->vdev_ms_count; i++) - metaslab_auto_trim(vd->vdev_ms[i], txg, !limited); + + /* + * Since we typically have hundreds of metaslabs per vdev, but we only + * trim them once every zfs_txgs_per_trim txgs, it'd be best if we + * could sequence the TRIM commands from all metaslabs so that they + * don't all always pound the device in the same txg. We do so taking + * the txg number modulo txgs_per_trim and then skipping by + * txgs_per_trim. Thus, for the default 200 metaslabs and 32 + * txgs_per_trim, we'll only be trimming ~6.25 metaslabs per txg. + */ + for (uint64_t i = txg % txgs_per_trim; i < vd->vdev_ms_count; + i += txgs_per_trim) + metaslab_auto_trim(vd->vdev_ms[i], !limited); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); out: diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 840657eaf119..c44cf22ed182 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1061,7 +1061,7 @@ zio_trim_check(uint64_t start, uint64_t len, void *msp) mutex_enter(&ms->ms_lock); ASSERT(ms->ms_trimming_ts != NULL); if (ms->ms_loaded) - ASSERT(range_tree_contains(ms->ms_trimming_ts->ts_tree, + ASSERT(range_tree_contains(ms->ms_trimming_ts, start - VDEV_LABEL_START_SIZE, len)); if (!held) mutex_exit(&ms->ms_lock);