Introduce write throttle smoothing #12868

Closed · wants to merge 1 commit
3 changes: 3 additions & 0 deletions include/sys/dsl_pool.h
@@ -64,6 +64,8 @@ extern int zfs_dirty_data_max_percent;
extern int zfs_dirty_data_max_max_percent;
extern int zfs_delay_min_dirty_percent;
extern unsigned long zfs_delay_scale;
+extern unsigned long zfs_smoothing_scale;
+extern unsigned long zfs_smoothing_write;

/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -115,6 +117,7 @@ typedef struct dsl_pool {
kcondvar_t dp_spaceavail_cv;
uint64_t dp_dirty_pertxg[TXG_SIZE];
uint64_t dp_dirty_total;
+hrtime_t dp_last_smooth;
Contributor:
A doc comment suggestion, if I understand the code correctly:

    The point in time after which the write smoothing mechanism has no effect.

If that interpretation of the variable's role is correct, I'd prefer a different name,
for example dp_stop_write_smooth_after.

Contributor:
This isn't a variable described in the doc; it's a variable that stores the time of
the last smoothing.
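
For reference, a doc comment along the lines the reviewer asks for might read as
follows (an illustrative sketch only, not part of the patch; the semantics are
inferred from how dmu_tx_delay() updates the field later in this diff):

    /*
     * Time, as returned by gethrtime(), until which write smoothing
     * remains in effect. dmu_tx_delay() pushes it forward by
     * zfs_smoothing_write seconds whenever it delays a transaction
     * while dirty data exceeds zfs_delay_min_dirty_percent.
     */
    hrtime_t dp_last_smooth;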

uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
uint64_t dp_mos_used_delta;
uint64_t dp_mos_compressed_delta;
17 changes: 16 additions & 1 deletion man/man4/zfs.4
@@ -15,7 +15,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
-.Dd June 1, 2021
+.Dd January 8, 2021
Member:
Looks like an unnecessary change.

Contributor:
Hmm, but when we change the man page, shouldn't we bump the .Dd?

Contributor:
Yeah, this is fine.

Member:
Sorry, I should be more specific: January 8, 2021 is older than June 1, 2021.
Or maybe it's a typo and it should be 2022?

Contributor:
Yep, it should be 2022. Thanks!

.Dt ZFS 4
.Os
.
@@ -944,6 +944,21 @@ This will smoothly handle between ten times and a tenth of this number.
.Pp
.Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 .
.
+.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong
+This controls for how many seconds smoothing should be applied.
+The smoothing mechanism is used to add additional transaction delays
+after the amount of dirty data drops below
+.Sy zfs_delay_min_dirty_percent .
+This mechanism may be used to avoid stalls and uneven performance during
+heavy write workloads.
+.
+.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq ulong
+Similar to
+.Sy zfs_delay_scale ,
+but for write smoothing.
+This variable controls the scale of the smoothing curve.
+Larger values cause longer delays for a given amount of dirty data.
+.
.It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disables requirement for IVset GUIDs to be present and match when doing a raw
receive of encrypted datasets.
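To make the two curves documented above concrete, here is a minimal standalone
sketch (values assumed for illustration: zfs_dirty_data_max of 4 GiB, plus the
defaults zfs_delay_min_dirty_percent = 60, zfs_delay_scale = 500000, and
zfs_smoothing_scale = 100000 from this patch; this program is not part of the
PR itself):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t dirty_data_max = 4ULL << 30;             /* assume 4 GiB */
        uint64_t delay_min_bytes = dirty_data_max * 60 / 100;
        uint64_t delay_scale = 500000;
        uint64_t smoothing_scale = 100000;
        uint64_t dirty = dirty_data_max / 2;              /* 50% dirty */

        /* Normal delay curve: zero below zfs_delay_min_dirty_percent. */
        uint64_t delay_time = (dirty > delay_min_bytes) ?
            delay_scale * (dirty - delay_min_bytes) /
            (dirty_data_max - dirty) : 0;

        /* Smoothing curve: still non-zero while the window is open. */
        uint64_t smoothing_time = smoothing_scale * dirty /
            (dirty_data_max - dirty);

        printf("delay %llu ns, smoothing %llu ns\n",
            (unsigned long long)delay_time,
            (unsigned long long)smoothing_time);
        return (0);
    }

At 50% dirty data the normal curve imposes no delay at all, while the smoothing
curve still yields 100000 ns (100 microseconds) per transaction; that residual
delay is what lets throughput ramp down gradually instead of oscillating around
the zfs_delay_min_dirty_percent threshold.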
35 changes: 25 additions & 10 deletions module/zfs/dmu_tx.c
@@ -776,14 +776,16 @@ static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
* of zfs_delay_scale to increase the steepness of the curve.
*/
static void
-dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth)
{
dsl_pool_t *dp = tx->tx_pool;
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-hrtime_t wakeup, min_tx_time, now;
+hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time;

-if (dirty <= delay_min_bytes)
+now = gethrtime();
Member:
An additional gethrtime() per I/O may not be huge, but it's not free either.

Contributor:
This one actually isn't per I/O; it's on a path where we "almost know" that we
have to delay. The per-I/O one would be in dsl_pool_need_dirty_delay.
But in general I agree that this isn't free.


+if (dirty <= delay_min_bytes && last_smooth <= now)
Contributor:
We could avoid this early exit if we are willing to take the "performance hit"
of the two zero initializations in lines 799 and 800 and the MIN(MAX(...)) code
in line 811.

Contributor:
Doesn't it make the code more readable?

return;

/*
@@ -794,11 +796,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
*/
ASSERT3U(dirty, <, zfs_dirty_data_max);

-now = gethrtime();
-min_tx_time = zfs_delay_scale *
-(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
-min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
-if (now > tx->tx_start + min_tx_time)
+smoothing_time = 0;
+delay_time = 0;
+
+if (dirty > delay_min_bytes) {
+delay_time = zfs_delay_scale *
+(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
Contributor:
Should probably use the occasion to READ_ONCE(zfs_dirty_data_max) at the top of
the function, into a const value, and then make all use sites of that variable
in this function use the const variable.

Contributor:
Yes, I can try that.

+}
+if (last_smooth > now) {
+smoothing_time = zfs_smoothing_scale * dirty /
+(zfs_dirty_data_max - dirty);
+}
+
+min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns);
+if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time)
return;

DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
@@ -808,6 +819,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
wakeup = MAX(tx->tx_start + min_tx_time,
dp->dp_last_wakeup + min_tx_time);
dp->dp_last_wakeup = wakeup;
+if (dirty > delay_min_bytes) {
+dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC;
+}
mutex_exit(&dp->dp_lock);

zfs_sleep_until(wakeup);
@@ -1069,7 +1083,7 @@
{
spa_t *spa = tx->tx_pool->dp_spa;
dsl_pool_t *dp = tx->tx_pool;
-hrtime_t before;
+hrtime_t before, last_smooth;

ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));
@@ -1089,10 +1103,11 @@
DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
while (dp->dp_dirty_total >= zfs_dirty_data_max)
cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+last_smooth = dp->dp_last_smooth;
Contributor:
Why is the read of dp_last_smooth not done in dmu_tx_delay, like the update?

Is it because we need to hold the mutex, and don't want to pay the cost of
re-acquiring it in dmu_tx_delay just to read the value?
Does it actually make a perf difference in practice?

Contributor:
Yes, it's because we would need to take the mutex again.
This mirrors dp_dirty_total, which also requires the mutex.

dirty = dp->dp_dirty_total;
mutex_exit(&dp->dp_lock);

-dmu_tx_delay(tx, dirty);
+dmu_tx_delay(tx, dirty, last_smooth);

tx->tx_wait_dirty = B_FALSE;

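Putting the dmu_tx.c hunks together, the resulting delay computation reads
roughly as follows (a condensed sketch with review comments added; the DTRACE
probe is omitted and this is not the verbatim patched file):

    static void
    dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth)
    {
        dsl_pool_t *dp = tx->tx_pool;
        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
        hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time;

        now = gethrtime();

        /* Fast path: little dirty data and no open smoothing window. */
        if (dirty <= delay_min_bytes && last_smooth <= now)
            return;

        smoothing_time = 0;
        delay_time = 0;

        /* Pre-existing curve, driven by the amount of dirty data. */
        if (dirty > delay_min_bytes) {
            delay_time = zfs_delay_scale * (dirty - delay_min_bytes) /
                (zfs_dirty_data_max - dirty);
        }
        /* New curve: keeps delaying while the smoothing window is open. */
        if (last_smooth > now) {
            smoothing_time = zfs_smoothing_scale * dirty /
                (zfs_dirty_data_max - dirty);
        }

        /* Take whichever curve demands more, capped at zfs_delay_max_ns. */
        min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns);
        if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time)
            return;

        mutex_enter(&dp->dp_lock);
        wakeup = MAX(tx->tx_start + min_tx_time,
            dp->dp_last_wakeup + min_tx_time);
        dp->dp_last_wakeup = wakeup;
        /* Delaying above the threshold (re)opens the smoothing window. */
        if (dirty > delay_min_bytes) {
            dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC;
        }
        mutex_exit(&dp->dp_lock);

        zfs_sleep_until(wakeup);
    }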
11 changes: 10 additions & 1 deletion module/zfs/dsl_pool.c
@@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0;
unsigned long zfs_dirty_data_max_max = 0;
int zfs_dirty_data_max_percent = 10;
int zfs_dirty_data_max_max_percent = 25;
+unsigned long zfs_smoothing_write = 0;
Member:
I don't like it when users are required to tune things to get good results.
If this patch is so good, why is it disabled by default?

And as I've written in my comment, it would be good to understand the actual
cause of the fluctuation. Why did the added write accounting not make it smooth
enough? Unavoidable fluctuations due to metadata updates and cache flushes
during commit?

Contributor:
Alexander, thank you for raising your concerns.

In general, the existing mechanism should be enough in most cases.
We can tune it to slow down writes from some threshold, and the delay will
increase over time. In our configuration that means applying the delay very
early and with more significant delays, which causes a drop in overall
performance, although it does, of course, make the writes more stable.

As you mention, the fluctuation with txg commits has been known for a long
time, and this is a small implementation which in some cases may help users.
The issue is that the current mechanism is not adaptive: delays are applied
without looking at the current load. If there is a big chunk of data and then
nothing, do we want to apply the delay? And on the other hand, if we just
flushed a big piece of data and there is another chunk of data, and another,
should we apply the delays? The current algorithm looks only at one variable,
the current usage of the dirty buffers.

This patch is not ideal, we agree, but it tries to address this issue and gives
at least some mechanism to test. We can also look at it as using historical
data to determine whether we should apply pressure.

Why is it disabled by default? We successfully tested it on our installation,
but we are unsure how all configurations will react. We prefer to start small
and see how the broader community responds to this approach. We are more than
happy to enable it by default at some point. On the other hand, if a better
approach appears, or this one proves impractical on most installations, we are
more than happy to remove it. Fortunately, the mechanism isn't very intrusive:
it is not an on-disk format change, so if we decide to adapt it, or replace it
with another mechanism in the future, we won't have any problems doing that.

If you have another idea for dealing with the fluctuation, we are more than
happy to try it.


/*
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
@@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60;
* multiply in dmu_tx_delay().
*/
unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+unsigned long zfs_smoothing_scale = 100000;

/*
* This determines the number of threads used by the dp_sync_taskq.
@@ -958,9 +960,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)

mutex_enter(&dp->dp_lock);
uint64_t dirty = dp->dp_dirty_total;
+hrtime_t last_delay = dp->dp_last_smooth;
mutex_exit(&dp->dp_lock);

-return (dirty > delay_min_bytes);
+return (dirty > delay_min_bytes || last_delay > gethrtime());
}

static boolean_t
@@ -1462,6 +1465,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
"The size limit of write-transaction zil log data");

+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW,
+"How long to keep smoothing writes after the last delay (sec)");

/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
"zfs_dirty_data_max upper bound in bytes");
@@ -1472,6 +1478,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
"How quickly delay approaches infinity");

+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW,
+"Delay smoothing scale");

ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
"Max percent of CPUs that are used to sync dirty data");
