Introduce write throttle smoothing
To avoid stalls and uneven performance during heavy write workloads,
continue to apply write throttling for up to zfs_smoothing_write
seconds even after the amount of dirty data has temporarily dipped
below zfs_delay_min_dirty_percent.

Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
oshogbo committed Jan 8, 2022
1 parent ce2bdce commit 98e009a
Showing 4 changed files with 54 additions and 12 deletions.
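In short, the pool now records a smoothing deadline whenever a throttled
transaction is delayed, and later transactions keep being delayed until that
deadline passes, even if dirty data momentarily falls back under the
threshold. A hypothetical timeline (assuming zfs_smoothing_write=5 and the
default zfs_delay_min_dirty_percent=60; the dirty-data percentages are made
up for illustration):

    t=0s   dirty at 70%  ->  normal delay; smoothing deadline set to t=5s
    t=2s   dirty at 45%  ->  below the 60% cutoff, but still delayed,
                             because the smoothing deadline is still open
    t=6s   dirty at 45%  ->  deadline passed; no delay until dirty data
                             crosses the threshold again

This keeps the throttle from oscillating between full delay and no delay
when dirty data hovers near the cutoff.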
3 changes: 3 additions & 0 deletions include/sys/dsl_pool.h
@@ -65,6 +65,8 @@ extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
 extern int zfs_delay_min_dirty_percent;
 extern unsigned long zfs_delay_scale;
+extern unsigned long zfs_smoothing_scale;
+extern unsigned long zfs_smoothing_write;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define DMU_OT_DEFERRED DMU_OT_NONE
@@ -116,6 +118,7 @@ typedef struct dsl_pool {
         kcondvar_t dp_spaceavail_cv;
         uint64_t dp_dirty_pertxg[TXG_SIZE];
         uint64_t dp_dirty_total;
+        hrtime_t dp_last_smooth;
         uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
         uint64_t dp_mos_used_delta;
         uint64_t dp_mos_compressed_delta;
17 changes: 16 additions & 1 deletion man/man4/zfs.4
@@ -15,7 +15,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd June 1, 2021
+.Dd January 8, 2022
 .Dt ZFS 4
 .Os
 .
@@ -928,6 +928,21 @@ This will smoothly handle between ten times and a tenth of this number.
 .Pp
 .Sy zfs_delay_scale * zfs_dirty_data_max Em must be smaller than Sy 2^64 .
 .
+.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong
+Controls how long, in seconds, smoothing remains in effect.
+The smoothing mechanism adds extra transaction delays
+after the amount of dirty data drops below
+.Sy zfs_delay_min_dirty_percent .
+This mechanism may be used to avoid stalls and uneven performance during
+heavy write workloads.
+.
+.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq ulong
+Similar to
+.Sy zfs_delay_scale ,
+but for write smoothing.
+This variable controls the scale of the smoothing curve.
+Larger values cause longer delays for a given amount of dirty data.
+.
 .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disables requirement for IVset GUIDs to be present and match when doing a raw
 receive of encrypted datasets.
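For a concrete sense of the smoothing curve documented above, here is a
worked example; the dirty-data levels are assumed, not measured:

    smoothing_time = zfs_smoothing_scale * dirty / (zfs_dirty_data_max - dirty)

With the default scale of 100000, a transaction arriving while the smoothing
window is open waits about 100,000 ns (100 us) when dirty data sits at 50% of
zfs_dirty_data_max (ratio 1), and about 900,000 ns when it sits at 90%
(ratio 9). Whichever of the smoothing delay and the regular delay curve is
larger wins, and the result is still capped by zfs_delay_max_ns.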
35 changes: 25 additions & 10 deletions module/zfs/dmu_tx.c
@@ -778,14 +778,16 @@ int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
  * of zfs_delay_scale to increase the steepness of the curve.
  */
 static void
-dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth)
 {
         dsl_pool_t *dp = tx->tx_pool;
         uint64_t delay_min_bytes =
             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-        hrtime_t wakeup, min_tx_time, now;
+        hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time;
 
-        if (dirty <= delay_min_bytes)
+        now = gethrtime();
+
+        if (dirty <= delay_min_bytes && last_smooth <= now)
                 return;
 
         /*
@@ -796,11 +798,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
          */
         ASSERT3U(dirty, <, zfs_dirty_data_max);
 
-        now = gethrtime();
-        min_tx_time = zfs_delay_scale *
-            (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
-        min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
-        if (now > tx->tx_start + min_tx_time)
+        smoothing_time = 0;
+        delay_time = 0;
+
+        if (dirty > delay_min_bytes) {
+                delay_time = zfs_delay_scale *
+                    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+        }
+        if (last_smooth > now) {
+                smoothing_time = zfs_smoothing_scale * dirty /
+                    (zfs_dirty_data_max - dirty);
+        }
+
+        min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns);
+        if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time)
                 return;
 
         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
@@ -810,6 +821,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
         wakeup = MAX(tx->tx_start + min_tx_time,
             dp->dp_last_wakeup + min_tx_time);
         dp->dp_last_wakeup = wakeup;
+        if (dirty > delay_min_bytes) {
+                dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC;
+        }
         mutex_exit(&dp->dp_lock);
 
         zfs_sleep_until(wakeup);
@@ -1071,7 +1085,7 @@ dmu_tx_wait(dmu_tx_t *tx)
 {
         spa_t *spa = tx->tx_pool->dp_spa;
         dsl_pool_t *dp = tx->tx_pool;
-        hrtime_t before;
+        hrtime_t before, last_smooth;
 
         ASSERT(tx->tx_txg == 0);
         ASSERT(!dsl_pool_config_held(tx->tx_pool));
@@ -1091,10 +1105,11 @@ dmu_tx_wait(dmu_tx_t *tx)
                 DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+                last_smooth = dp->dp_last_smooth;
                 dirty = dp->dp_dirty_total;
                 mutex_exit(&dp->dp_lock);
 
-                dmu_tx_delay(tx, dirty);
+                dmu_tx_delay(tx, dirty, last_smooth);
 
                 tx->tx_wait_dirty = B_FALSE;
 
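The updated dmu_tx_delay() computation above can be distilled into a small
userspace sketch, handy for experimenting with the tunables. This is an
illustration only: tx_delay_ns() is a hypothetical helper, the tunable values
are the module defaults plus an assumed 4 GiB zfs_dirty_data_max, and the
smoothing window is reduced to a boolean flag:

    /*
     * Minimal userspace sketch of the patched dmu_tx_delay() computation.
     * Illustration only: values below are stand-ins, not pool measurements.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    static uint64_t zfs_dirty_data_max = 4ULL << 30;        /* assume 4 GiB */
    static uint64_t zfs_delay_min_dirty_percent = 60;       /* default */
    static uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* default */
    static uint64_t zfs_smoothing_scale = 100000;           /* default */
    static uint64_t zfs_delay_max_ns = 100ULL * 1000 * 1000; /* 100 ms cap */

    /*
     * Minimum delay (ns) for one transaction, given the amount of dirty
     * data and whether the smoothing window (dp_last_smooth) is still open.
     */
    static uint64_t
    tx_delay_ns(uint64_t dirty, int smoothing_open)
    {
            uint64_t delay_min_bytes =
                zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
            uint64_t delay_time = 0, smoothing_time = 0;

            if (dirty <= delay_min_bytes && !smoothing_open)
                    return (0);
            if (dirty > delay_min_bytes)
                    delay_time = zfs_delay_scale * (dirty - delay_min_bytes) /
                        (zfs_dirty_data_max - dirty);
            if (smoothing_open)
                    smoothing_time = zfs_smoothing_scale * dirty /
                        (zfs_dirty_data_max - dirty);
            return (MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns));
    }

    int
    main(void)
    {
            /* Dirty data just under the 60% threshold. */
            uint64_t dirty = zfs_dirty_data_max * 59 / 100;

            printf("window closed: %llu ns\n",
                (unsigned long long)tx_delay_ns(dirty, 0));
            printf("window open:   %llu ns\n",
                (unsigned long long)tx_delay_ns(dirty, 1));
            return (0);
    }

Run as written, the first line prints 0 (dirty data is below the threshold
and no smoothing window is open), while the second prints roughly 144,000 ns,
showing how smoothing keeps a gentle brake on writers just after the
threshold has been crossed.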
11 changes: 10 additions & 1 deletion module/zfs/dsl_pool.c
@@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0;
 unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
+unsigned long zfs_smoothing_write = 0;
 
 /*
  * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
@@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60;
  * multiply in dmu_tx_delay().
  */
 unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+unsigned long zfs_smoothing_scale = 100000;
 
 /*
  * This determines the number of threads used by the dp_sync_taskq.
@@ -955,9 +957,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 
         mutex_enter(&dp->dp_lock);
         uint64_t dirty = dp->dp_dirty_total;
+        hrtime_t last_delay = dp->dp_last_smooth;
         mutex_exit(&dp->dp_lock);
 
-        return (dirty > delay_min_bytes);
+        return (dirty > delay_min_bytes || last_delay > gethrtime());
 }
 
 static boolean_t
@@ -1462,6 +1465,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
         "The size limit of write-transaction zil log data");
 
+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW,
+        "How long to keep smoothing writes after the last delay (seconds)");
+
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
         "zfs_dirty_data_max upper bound in bytes");
@@ -1472,6 +1478,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
         "How quickly delay approaches infinity");
 
+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW,
+        "Delay smoothing scale");
+
 ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
         "Max percent of CPUs that are used to sync dirty data");
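On Linux builds these registrations surface as module parameters, typically
under /sys/module/zfs/parameters/ (zfs_smoothing_write and zfs_smoothing_scale
alongside the existing delay tunables), so the smoothing window and curve can
be adjusted at runtime. With the default zfs_smoothing_write of 0 the window
never opens and behavior is unchanged; smoothing only takes effect once it is
set above zero.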
