Skip to content

Commit

Permalink
ddt: block scan until log is flushed, and flush aggressively
Browse files Browse the repository at this point in the history
The dedup log does not have a stable cursor, so it's not possible to
persist our current scan location within it across pool reloads.
Because of this, when walking (scanning), we can't treat it like just
another source of dedup entries.

Instead, when a scan is wanted, we switch to an aggressive flushing
mode, pushing out entries older than the scan start txg as fast as we
can, before starting the scan proper.

Entries after the scan start txg will be handled via other methods; the
DDT ZAPs and logs will be written as normal, and blocks not seen yet
will be offered to the scan machinery as normal.

Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
  • Loading branch information
robn and allanjude committed May 15, 2024
1 parent 2004199 commit 4479c53
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 6 deletions.
5 changes: 5 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ typedef struct {
int32_t ddt_log_flush_rate; /* rolling log flush rate */
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */

uint64_t ddt_flush_force_txg; /* flush hard before this txg */

enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
Expand Down Expand Up @@ -346,6 +348,9 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);

extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);

Expand Down
68 changes: 68 additions & 0 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,12 @@
* position on the object even if the object changes, the pool is exported, or
* OpenZFS is upgraded.
*
* If the "fast_dedup" feature is enabled and the table has a log, the scan
* cannot begin until entries on the log are flushed, as the on-disk log has no
* concept of a "stable position". Instead, the log flushing process will enter
* a more aggressive mode, to flush out as much as is necessary as soon as
* possible, in order to begin the scan as soon as possible.
*
* ## Interaction with block cloning
*
* If block cloning and dedup are both enabled on a pool, BRT will look for the
Expand Down Expand Up @@ -1639,6 +1645,16 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
ddt->ddt_flush_min = MAX(
ddt->ddt_log_ingest_rate,
zfs_dedup_log_flush_entries_min);

/*
* If we've been asked to flush everything in a hurry,
* try to dump as much as possible on this txg. In
* this case we're only limited by time, not amount.
*/
if (ddt->ddt_flush_force_txg > 0)
ddt->ddt_flush_min =
MAX(ddt->ddt_flush_min, avl_numnodes(
&ddt->ddt_log_flushing->ddl_tree));
} else {
/* We already decided we're done for this txg */
return (B_FALSE);
Expand Down Expand Up @@ -1774,6 +1790,12 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
(void) ddt_log_swap(ddt, tx);
}

if (ddt->ddt_flush_force_txg > 0 &&
avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
/* Both logs are empty, so no more force flush */
ddt->ddt_flush_force_txg = 0;

/*
* Update flush rate. This is an exponential weighted moving average of
* the number of entries flushed over recent txgs.
Expand Down Expand Up @@ -1943,6 +1965,48 @@ ddt_sync(spa_t *spa, uint64_t txg)
dmu_tx_commit(tx);
}

/*
 * Prepare the dedup tables for a walk (scan) starting at `txg`.
 *
 * A txg of 0 means "use the txg that is currently syncing". For every
 * table that has DDT_FLAG_LOG set, if either the active or the flushing
 * log holds entries and that log's ddl_first_txg is at or before the
 * chosen txg, ddt_flush_force_txg is set to that txg. Tables that are
 * absent or have no log are skipped.
 */
void
ddt_walk_init(spa_t *spa, uint64_t txg)
{
	const uint64_t walk_txg = (txg != 0) ? txg : spa_syncing_txg(spa);

	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
	    cksum++) {
		ddt_t *ddt = spa->spa_ddt[cksum];

		if (ddt == NULL)
			continue;
		if ((ddt->ddt_flags & DDT_FLAG_LOG) == 0)
			continue;

		ddt_enter(ddt);

		/*
		 * A log is considered stale when it is non-empty and its
		 * ddl_first_txg is no later than walk_txg. The flushing log
		 * is only examined when the active log is not stale.
		 */
		boolean_t stale = B_FALSE;
		if (!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
		    ddt->ddt_log_active->ddl_first_txg <= walk_txg) {
			stale = B_TRUE;
		} else if (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
		    ddt->ddt_log_flushing->ddl_first_txg <= walk_txg) {
			stale = B_TRUE;
		}

		/* Record the txg so the flush path switches to forced mode. */
		if (stale)
			ddt->ddt_flush_force_txg = walk_txg;

		ddt_exit(ddt);
	}
}

/*
 * Report whether the dedup tables can be walked right now.
 *
 * Returns B_FALSE as soon as any table with DDT_FLAG_LOG set still has a
 * non-zero ddt_flush_force_txg; returns B_TRUE otherwise. Tables that are
 * absent or have no log are ignored.
 */
boolean_t
ddt_walk_ready(spa_t *spa)
{
	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
	    cksum++) {
		ddt_t *ddt = spa->spa_ddt[cksum];
		const boolean_t has_log =
		    (ddt != NULL && (ddt->ddt_flags & DDT_FLAG_LOG) != 0);

		/* A pending forced flush on any logged table blocks the walk. */
		if (has_log && ddt->ddt_flush_force_txg != 0)
			return (B_FALSE);
	}

	return (B_TRUE);
}

int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
{
Expand All @@ -1952,6 +2016,10 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
if (ddt == NULL)
continue;

if (ddt->ddt_flush_force_txg > 0)
return (EAGAIN);

int error = ENOENT;
if (ddt_object_exists(ddt, ddb->ddb_type,
ddb->ddb_class)) {
Expand Down
8 changes: 6 additions & 2 deletions module/zfs/ddt_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,8 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
/*
* Swap policy. We swap the logs (and so begin flushing) when the
* active tree grows too large, or when we haven't swapped it in
* some amount of time.
* some amount of time, or if something has requested the logs be
* flushed ASAP (see ddt_walk_init()).
*/

/*
Expand All @@ -457,7 +458,10 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
(ddt->ddt_log_active->ddl_first_txg +
MAX(1, zfs_dedup_log_txg_max));

if (!(too_large || too_old))
const boolean_t force =
ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

if (!(too_large || too_old || force))
return (B_FALSE);

ddt_log_t *swap = ddt->ddt_log_active;
Expand Down
25 changes: 21 additions & 4 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zap_cursor_fini(&zc);
}

ddt_walk_init(spa, scn->scn_phys.scn_min_txg);

spa_scan_stat_init(spa);
vdev_scan_stat_init(spa->spa_root_vdev);

Expand Down Expand Up @@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)

memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));

ddt_walk_init(spa, scn->scn_phys.scn_min_txg);

dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);

spa_history_log_internal(spa, "scan setup", tx,
Expand Down Expand Up @@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) ||
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
!ddt_walk_ready(scn->scn_dp->dp_spa)) {
if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
dprintf("suspending at first available bookmark "
"%llx/%llx/%llx/%llx\n",
Expand Down Expand Up @@ -3029,9 +3034,21 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
break;
}

zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
"suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
if (error == EAGAIN) {
dsl_scan_check_suspend(scn, NULL);
error = 0;

zfs_dbgmsg("waiting for ddt to become ready for scan "
"on %s with class_max = %u; suspending=%u",
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
} else
zfs_dbgmsg("scanned %llu ddt entries on %s with "
"class_max = %u; suspending=%u", (longlong_t)n,
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);

ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ while (( i < 16384 )); do
done
((i += 1))
done

# Force the DDT logs to disk with a scrub so they can be prefetched
log_must zpool scrub -w $TESTPOOL

log_note "Dataset generation completed."

typeset -A generated
Expand Down

0 comments on commit 4479c53

Please sign in to comment.