Skip to content

Commit

Permalink
dmu_objset_from_ds must be called with dp_config_rwlock held
Browse files Browse the repository at this point in the history
The normal lock order is that the dp_config_rwlock must be held before
the ds_opening_lock.  For example, dmu_objset_hold() does this.
However, dmu_objset_open_impl() is called with the ds_opening_lock held,
and if the dp_config_rwlock is not already held, it will attempt to
acquire it.  This may lead to deadlock, since the lock order is
reversed.

Looking at all the callers of dmu_objset_open_impl() (which is
principally the callers of dmu_objset_from_ds()), almost all callers
already have the dp_config_rwlock.  However, there are a few places in
the send and receive code paths that do not.  For example:
dsl_crypto_populate_key_nvlist, send_cb, dmu_recv_stream,
receive_write_byref, redact_traverse_thread.

This commit resolves the problem by requiring all callers ot
dmu_objset_from_ds() to hold the dp_config_rwlock.  In most cases, the
code has been restructured such that we call dmu_objset_from_ds()
earlier on in the send and receive processes, when we already have the
dp_config_rwlock, and save the objset_t until we need it in the middle
of the send or receive (similar to what we already do with the
dsl_dataset_t).  Thus we do not need to acquire the dp_config_rwlock in
many new places.

I also cleaned up code in dmu_redact_snap() and send_traverse_thread().

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Zuchowski <pzuchowski@datto.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes openzfs#9662
Closes openzfs#10115
  • Loading branch information
ahrens committed Mar 12, 2020
1 parent fa130e0 commit 0fdd610
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 115 deletions.
2 changes: 1 addition & 1 deletion include/sys/dsl_crypt.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ void key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag);
int spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
dsl_crypto_key_t **dck_out);

int dsl_crypto_populate_key_nvlist(struct dsl_dataset *ds,
int dsl_crypto_populate_key_nvlist(struct objset *os,
uint64_t from_ivset_guid, nvlist_t **nvl_out);
int dsl_crypto_recv_raw_key_check(struct dsl_dataset *ds,
nvlist_t *nvl, dmu_tx_t *tx);
Expand Down
6 changes: 6 additions & 0 deletions module/os/linux/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1804,7 +1804,10 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
objset_t *os;
VERIFY3P(ds->ds_owner, ==, zfsvfs);
VERIFY(dsl_dataset_long_held(ds));
dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
dsl_pool_config_enter(dp, FTAG);
VERIFY0(dmu_objset_from_ds(ds, &os));
dsl_pool_config_exit(dp, FTAG);

err = zfsvfs_init(zfsvfs, os);
if (err != 0)
Expand Down Expand Up @@ -1895,7 +1898,10 @@ zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
objset_t *os;
VERIFY3P(ds->ds_owner, ==, zfsvfs);
VERIFY(dsl_dataset_long_held(ds));
dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
dsl_pool_config_enter(dp, FTAG);
VERIFY0(dmu_objset_from_ds(ds, &os));
dsl_pool_config_exit(dp, FTAG);
zfsvfs->z_os = os;

/* release the VOPs */
Expand Down
27 changes: 9 additions & 18 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
ASSERT(!BP_IS_REDACTED(bp));

/*
* We need the pool config lock to get properties.
*/
ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));

/*
* The $ORIGIN dataset (if it exists) doesn't have an associated
* objset, so there's no reason to open it. The $ORIGIN dataset
Expand Down Expand Up @@ -503,20 +508,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* checksum/compression/copies.
*/
if (ds != NULL) {
boolean_t needlock = B_FALSE;

os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);

/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
* be held.
*/
if (!dsl_pool_config_held(dmu_objset_pool(os))) {
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}

err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
Expand Down Expand Up @@ -579,8 +572,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
smallblk_changed_cb, os);
}
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
Expand Down Expand Up @@ -650,11 +641,11 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0;

/*
* We shouldn't be doing anything with dsl_dataset_t's unless the
* pool_config lock is held, or the dataset is long-held.
* We need the pool_config lock to manipulate the dsl_dataset_t.
* Even if the dataset is long-held, we need the pool_config lock
* to open the objset, as it needs to get properties.
*/
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
dsl_dataset_long_held(ds));
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

mutex_enter(&ds->ds_opening_lock);
if (ds->ds_objset == NULL) {
Expand Down
41 changes: 22 additions & 19 deletions module/zfs/dmu_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ struct receive_writer_arg {
typedef struct guid_map_entry {
uint64_t guid;
boolean_t raw;
dsl_dataset_t *gme_ds;
objset_t *gme_os;
avl_node_t avlnode;
} guid_map_entry_t;

Expand Down Expand Up @@ -914,6 +914,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
rrw_exit(&newds->ds_bp_rwlock, FTAG);

drba->drba_cookie->drc_ds = newds;
drba->drba_cookie->drc_os = os;

spa_history_log_internal_ds(newds, "receive", tx, " ");
}
Expand Down Expand Up @@ -1095,6 +1096,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
rrw_exit(&ds->ds_bp_rwlock, FTAG);

drba->drba_cookie->drc_ds = ds;
VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
drba->drba_cookie->drc_should_save = B_TRUE;

spa_history_log_internal_ds(ds, "resume receive", tx, " ");
Expand Down Expand Up @@ -1227,11 +1229,12 @@ free_guid_map_onexit(void *arg)
ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;

if (gmep->raw) {
gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE;
gmep->gme_os->os_raw_receive = B_FALSE;
dsflags &= ~DS_HOLD_FLAG_DECRYPT;
}

dsl_dataset_disown(gmep->gme_ds, dsflags, gmep);
dsl_dataset_disown(gmep->gme_os->os_dsl_dataset,
dsflags, gmep);
kmem_free(gmep, sizeof (guid_map_entry_t));
}
avl_destroy(ca);
Expand Down Expand Up @@ -1797,8 +1800,7 @@ receive_write_byref(struct receive_writer_arg *rwa,
&where)) == NULL) {
return (SET_ERROR(EINVAL));
}
if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
return (SET_ERROR(EINVAL));
ref_os = gmep->gme_os;
} else {
ref_os = rwa->os;
}
Expand Down Expand Up @@ -2661,10 +2663,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd,
DMU_SUBSTREAM);
ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

/*
* Open the objset we are modifying.
*/
VERIFY0(dmu_objset_from_ds(drc->drc_ds, &drc->drc_os));
ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
ASSERT0(drc->drc_os->os_encrypted &&
(drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
Expand Down Expand Up @@ -3013,6 +3011,12 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)

dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
origin_head, tx);
/*
* The objset was evicted by dsl_dataset_clone_swap_sync_impl,
* so drc_os is no longer valid.
*/
drc->drc_os = NULL;

dsl_dataset_snapshot_sync_impl(origin_head,
drc->drc_tosnap, tx);

Expand Down Expand Up @@ -3127,26 +3131,25 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds);

if (err == 0) {
err = dmu_objset_from_ds(snapds, &os);
if (err != 0) {
dsl_dataset_disown(snapds, dsflags, FTAG);
dsl_pool_rele(dp, FTAG);
kmem_free(gmep, sizeof (*gmep));
return (err);
}
/*
* If this is a deduplicated raw send stream, we need
* to make sure that we can still read raw blocks from
* earlier datasets in the stream, so we set the
* os_raw_receive flag now.
*/
if (raw) {
err = dmu_objset_from_ds(snapds, &os);
if (err != 0) {
dsl_dataset_disown(snapds, dsflags, FTAG);
dsl_pool_rele(dp, FTAG);
kmem_free(gmep, sizeof (*gmep));
return (err);
}
if (raw)
os->os_raw_receive = B_TRUE;
}

gmep->raw = raw;
gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
gmep->gme_ds = snapds;
gmep->gme_os = os;
avl_add(guid_map, gmep);
} else {
kmem_free(gmep, sizeof (*gmep));
Expand Down
94 changes: 50 additions & 44 deletions module/zfs/dmu_redact.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ struct redact_record {

struct redact_thread_arg {
bqueue_t q;
objset_t *os; /* Objset to traverse */
dsl_dataset_t *ds; /* Dataset to traverse */
struct redact_record *current_record;
int error_code;
Expand Down Expand Up @@ -355,11 +356,9 @@ redact_traverse_thread(void *arg)
struct redact_thread_arg *rt_arg = arg;
int err;
struct redact_record *data;
objset_t *os;
VERIFY0(dmu_objset_from_ds(rt_arg->ds, &os));
#ifdef _KERNEL
if (os->os_phys->os_type == DMU_OST_ZFS)
rt_arg->deleted_objs = zfs_get_deleteq(os);
if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS)
rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os);
else
rt_arg->deleted_objs = objlist_create();
#else
Expand Down Expand Up @@ -1004,9 +1003,8 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
int err = 0;
dsl_pool_t *dp = NULL;
dsl_dataset_t *ds = NULL;
objset_t *os;
int numsnaps = 0;
dsl_dataset_t **redactsnaparr = NULL;
objset_t *os;
struct redact_thread_arg *args = NULL;
redaction_list_t *new_rl = NULL;

Expand All @@ -1026,39 +1024,45 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
err = EALREADY;
goto out;
}
nvpair_t *pair;

if (fnvlist_num_pairs(redactnvl) > 0 && err == 0) {
redactsnaparr = kmem_zalloc(fnvlist_num_pairs(redactnvl) *
sizeof (dsl_dataset_t *), KM_SLEEP);
}
for (pair = nvlist_next_nvpair(redactnvl, NULL); err == 0 &&
pair != NULL; pair = nvlist_next_nvpair(redactnvl, pair)) {
numsnaps = fnvlist_num_pairs(redactnvl);
if (numsnaps > 0)
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);

nvpair_t *pair = NULL;
for (int i = 0; i < numsnaps; i++) {
pair = nvlist_next_nvpair(redactnvl, pair);
const char *name = nvpair_name(pair);
struct redact_thread_arg *rta = &args[i];
err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT,
FTAG, redactsnaparr + numsnaps);
FTAG, &rta->ds);
if (err != 0)
break;
/*
* We want to do the long hold before we can get any other
* errors, because the cleanup code will release the long
* hold if rta->ds is filled in.
*/
dsl_dataset_long_hold(rta->ds, FTAG);

err = dmu_objset_from_ds(rta->ds, &rta->os);
if (err != 0)
break;
dsl_dataset_long_hold(redactsnaparr[numsnaps], FTAG);
if (!dsl_dataset_is_before(redactsnaparr[numsnaps], ds, 0)) {
if (!dsl_dataset_is_before(rta->ds, ds, 0)) {
err = EINVAL;
numsnaps++;
break;
}
if (dsl_dataset_feature_is_active(redactsnaparr[numsnaps],
if (dsl_dataset_feature_is_active(rta->ds,
SPA_FEATURE_REDACTED_DATASETS)) {
err = EALREADY;
numsnaps++;
break;

}
numsnaps++;
}
VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL);
if (err != 0)
goto out;

ASSERT3U(fnvlist_num_pairs(redactnvl), ==, numsnaps);

boolean_t resuming = B_FALSE;
char newredactbook[ZFS_MAX_DATASET_NAME_LEN];
zfs_bookmark_phys_t bookmark;
Expand Down Expand Up @@ -1091,15 +1095,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
goto out;
}
for (int i = 0; i < numsnaps; i++) {
struct redact_thread_arg *rta = &args[i];
if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps,
new_rl->rl_phys->rlp_num_snaps,
dsl_dataset_phys(redactsnaparr[i])->ds_guid)) {
dsl_dataset_phys(rta->ds)->ds_guid)) {
err = ESRCH;
goto out;
}
}
if (numsnaps > 0)
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
new_rl->rl_phys->rlp_last_object == UINT64_MAX) {
err = EEXIST;
Expand All @@ -1112,10 +1115,11 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
if (numsnaps > 0) {
guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
KM_SLEEP);
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
}
for (int i = 0; i < numsnaps; i++)
guids[i] = dsl_dataset_phys(redactsnaparr[i])->ds_guid;
for (int i = 0; i < numsnaps; i++) {
struct redact_thread_arg *rta = &args[i];
guids[i] = dsl_dataset_phys(rta->ds)->ds_guid;
}

dsl_pool_rele(dp, FTAG);
dp = NULL;
Expand All @@ -1128,18 +1132,18 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
}

for (int i = 0; i < numsnaps; i++) {
args[i].ds = redactsnaparr[i];
(void) bqueue_init(&args[i].q, zfs_redact_queue_ff,
struct redact_thread_arg *rta = &args[i];
(void) bqueue_init(&rta->q, zfs_redact_queue_ff,
zfs_redact_queue_length,
offsetof(struct redact_record, ln));
if (resuming) {
args[i].resume.zb_blkid =
rta->resume.zb_blkid =
new_rl->rl_phys->rlp_last_blkid;
args[i].resume.zb_object =
rta->resume.zb_object =
new_rl->rl_phys->rlp_last_object;
}
args[i].txg = dsl_dataset_phys(ds)->ds_creation_txg;
(void) thread_create(NULL, 0, redact_traverse_thread, &args[i],
rta->txg = dsl_dataset_phys(ds)->ds_creation_txg;
(void) thread_create(NULL, 0, redact_traverse_thread, rta,
0, curproc, TS_RUN, minclsyspri);
}
struct redact_merge_thread_arg rmta = { { {0} } };
Expand All @@ -1152,23 +1156,25 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
TS_RUN, minclsyspri);
err = perform_redaction(os, new_rl, &rmta);
out:
if (args != NULL) {
kmem_free(args, numsnaps * sizeof (*args));
}
if (new_rl != NULL) {
dsl_redaction_list_long_rele(new_rl, FTAG);
dsl_redaction_list_rele(new_rl, FTAG);
}
for (int i = 0; i < numsnaps; i++) {
dsl_dataset_long_rele(redactsnaparr[i], FTAG);
dsl_dataset_rele_flags(redactsnaparr[i], DS_HOLD_FLAG_DECRYPT,
FTAG);
struct redact_thread_arg *rta = &args[i];
/*
* rta->ds may be NULL if we got an error while filling
* it in.
*/
if (rta->ds != NULL) {
dsl_dataset_long_rele(rta->ds, FTAG);
dsl_dataset_rele_flags(rta->ds,
DS_HOLD_FLAG_DECRYPT, FTAG);
}
}

if (redactsnaparr != NULL) {
kmem_free(redactsnaparr, fnvlist_num_pairs(redactnvl) *
sizeof (dsl_dataset_t *));
}
if (args != NULL)
kmem_free(args, numsnaps * sizeof (*args));
if (dp != NULL)
dsl_pool_rele(dp, FTAG);
if (ds != NULL) {
Expand Down
Loading

0 comments on commit 0fdd610

Please sign in to comment.