diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 44900f8ceb2f..4080017ee067 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -65,6 +65,8 @@ extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; extern int zfs_delay_min_dirty_percent; extern unsigned long zfs_delay_scale; +extern unsigned long zfs_smoothing_scale; +extern unsigned long zfs_smoothing_write; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -116,6 +118,7 @@ typedef struct dsl_pool { kcondvar_t dp_spaceavail_cv; uint64_t dp_dirty_pertxg[TXG_SIZE]; uint64_t dp_dirty_total; + hrtime_t dp_last_smooth; uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; uint64_t dp_mos_used_delta; uint64_t dp_mos_compressed_delta; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d7fc31bfde10..0ce7fa0750fe 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -15,7 +15,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd June 1, 2021 +.Dd January 8, 2021 .Dt ZFS 4 .Os . @@ -928,6 +928,21 @@ This will smoothly handle between ten times and a tenth of this number. .Pp .Sy zfs_delay_scale * zfs_dirty_data_max Em must be smaller than Sy 2^64 . . +.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong +This controls for how many seconds smoothing should be applied. +The smoothing mechanism is used to add additional transaction delays +after the amount of dirty data drops below +.Sy zfs_delay_min_dirty_percent . +This mechanism may be used to avoid stalls and uneven performance during +heavy write workloads. +. +.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq ulong +Similar to +.Sy zfs_delay_scale , +but for write smoothing. +This variable controls the scale of the smoothing curve. +Larger values cause longer delays for a given amount of dirty data. +. 
.It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5fa516866668..010db099bd45 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -778,14 +778,16 @@ int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ * of zfs_delay_scale to increase the steepness of the curve. */ static void -dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time; - if (dirty <= delay_min_bytes) + now = gethrtime(); + + if (dirty <= delay_min_bytes && last_smooth <= now) return; /* @@ -796,11 +798,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) */ ASSERT3U(dirty, <, zfs_dirty_data_max); - now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + smoothing_time = 0; + delay_time = 0; + + if (dirty > delay_min_bytes) { + delay_time = zfs_delay_scale * + (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); + } + if (last_smooth > now) { + smoothing_time = zfs_smoothing_scale * dirty / + (zfs_dirty_data_max - dirty); + } + + min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns); + if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, @@ -810,6 +821,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; + if (dirty > delay_min_bytes) { + dp->dp_last_smooth = now + zfs_smoothing_write * 
NANOSEC; + } mutex_exit(&dp->dp_lock); zfs_sleep_until(wakeup); @@ -1071,7 +1085,7 @@ dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; - hrtime_t before; + hrtime_t before, last_smooth; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); @@ -1091,10 +1105,11 @@ dmu_tx_wait(dmu_tx_t *tx) DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); + last_smooth = dp->dp_last_smooth; dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - dmu_tx_delay(tx, dirty); + dmu_tx_delay(tx, dirty, last_smooth); tx->tx_wait_dirty = B_FALSE; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 1350f1329564..113c12846a46 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0; unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +unsigned long zfs_smoothing_write = 0; /* * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. @@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60; * multiply in dmu_tx_delay(). */ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; +unsigned long zfs_smoothing_scale = 100000; /* * This determines the number of threads used by the dp_sync_taskq. 
@@ -955,9 +957,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) mutex_enter(&dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; + hrtime_t last_delay = dp->dp_last_smooth; mutex_exit(&dp->dp_lock); - return (dirty > delay_min_bytes); + return (dirty > delay_min_bytes || last_delay > gethrtime()); } static boolean_t @@ -1462,6 +1465,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, "The size limit of write-transaction zil log data"); +ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW, + "How long should we smooth write after last delay (sec)"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); @@ -1472,6 +1478,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, "How quickly delay approaches infinity"); +ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW, + "Delay smoothing scale"); + ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, "Max percent of CPUs that are used to sync dirty data");