Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zvol: ensure device minors are properly cleaned up #16364

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions include/sys/zvol_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,19 @@
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara, Inc.
*/

#ifndef _SYS_ZVOL_IMPL_H
#define _SYS_ZVOL_IMPL_H

#include <sys/zfs_context.h>

#define ZVOL_RDONLY 0x1
/*
* Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
* specifies whether or not the zvol _can_ be written to)
*/
#define ZVOL_WRITTEN_TO 0x2

#define ZVOL_DUMPIFIED 0x4

#define ZVOL_EXCL 0x8
#define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */
#define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */
#define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */
#define ZVOL_REMOVING (1<<3) /* zvol waiting to remove minor */

/*
* The in-core state of each volume.
Expand All @@ -57,6 +54,7 @@ typedef struct zvol_state {
kmutex_t zv_state_lock; /* protects zvol_state_t */
atomic_t zv_suspend_ref; /* refcount for suspend */
krwlock_t zv_suspend_lock; /* suspend lock */
kcondvar_t zv_removing_cv; /* ready to remove minor */
struct zvol_state_os *zv_zso; /* private platform state */
boolean_t zv_threading; /* volthreading property */
} zvol_state_t;
Expand Down
10 changes: 9 additions & 1 deletion module/os/freebsd/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2024, Klara, Inc.
*/

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
Expand Down Expand Up @@ -250,7 +251,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
}

mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying) {
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_zv_locked;
Expand Down Expand Up @@ -683,6 +684,11 @@ zvol_geom_bio_strategy(struct bio *bp)

rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

if (zv->zv_flags & ZVOL_REMOVING) {
error = SET_ERROR(ENXIO);
goto resume;
}

switch (bp->bio_cmd) {
case BIO_READ:
doread = B_TRUE;
Expand Down Expand Up @@ -1362,6 +1368,7 @@ zvol_os_free(zvol_state_t *zv)
}

mutex_destroy(&zv->zv_state_lock);
cv_destroy(&zv->zv_removing_cv);
dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
Expand Down Expand Up @@ -1419,6 +1426,7 @@ zvol_os_create_minor(const char *name)
zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
zv->zv_hash = hash;
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_volmode = volmode;
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
Expand Down
23 changes: 19 additions & 4 deletions module/os/linux/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
*/

#include <sys/dataset_kstats.h>
Expand Down Expand Up @@ -526,6 +527,11 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
uint64_t size = io_size(bio, rq);
int rw = io_data_dir(bio, rq);

if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
END_IO(zv, bio, rq, -SET_ERROR(ENXIO));
goto out;
}

if (zvol_request_sync || zv->zv_threading == B_FALSE)
force_sync = 1;

Expand Down Expand Up @@ -730,10 +736,17 @@ zvol_open(struct block_device *bdev, fmode_t flag)
#endif
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
return (-SET_ERROR(ENXIO));
}

mutex_enter(&zv->zv_state_lock);

if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}

tonyhutter marked this conversation as resolved.
Show resolved Hide resolved
/*
* Make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
Expand Down Expand Up @@ -795,10 +808,10 @@ zvol_open(struct block_device *bdev, fmode_t flag)

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
schedule();
return (SET_ERROR(-ERESTARTSYS));
return (-SET_ERROR(ERESTARTSYS));
#else
if ((gethrtime() - start) > timeout)
return (SET_ERROR(-ERESTARTSYS));
return (-SET_ERROR(ERESTARTSYS));

schedule_timeout_interruptible(
MSEC_TO_TICK(10));
Expand All @@ -821,7 +834,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
if (zv->zv_open_count == 0)
zvol_last_close(zv);

error = SET_ERROR(-EROFS);
error = -SET_ERROR(EROFS);
} else {
zv->zv_open_count++;
}
Expand Down Expand Up @@ -1313,6 +1326,7 @@ zvol_alloc(dev_t dev, const char *name)

list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);

#ifdef HAVE_BLK_MQ
zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
Expand Down Expand Up @@ -1438,6 +1452,7 @@ zvol_os_free(zvol_state_t *zv)
ida_simple_remove(&zvol_ida,
MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);

Expand Down
116 changes: 92 additions & 24 deletions module/zfs/zvol.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
*/

/*
Expand Down Expand Up @@ -894,6 +895,9 @@ zvol_resume(zvol_state_t *zv)
*/
atomic_dec(&zv->zv_suspend_ref);

if (zv->zv_flags & ZVOL_REMOVING)
cv_broadcast(&zv->zv_removing_cv);

return (SET_ERROR(error));
}

Expand Down Expand Up @@ -929,6 +933,9 @@ zvol_last_close(zvol_state_t *zv)
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_flags & ZVOL_REMOVING)
cv_broadcast(&zv->zv_removing_cv);

zvol_shutdown_zv(zv);

dmu_objset_disown(zv->zv_objset, 1, zv);
Expand Down Expand Up @@ -1221,6 +1228,41 @@ zvol_create_minor(const char *name)
* Remove minors for specified dataset including children and snapshots.
*/

/*
* Remove the minor for a given zvol. This will do it all:
* - flag the zvol for removal, so new requests are rejected
* - wait until outstanding requests are completed
* - remove it from lists
* - free it
* It's also usable as a taskq task, and smells nice too.
*/
static void
zvol_remove_minor_task(void *arg)
{
zvol_state_t *zv = (zvol_state_t *)arg;

ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));

mutex_enter(&zv->zv_state_lock);
while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
zv->zv_flags |= ZVOL_REMOVING;
cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
}
mutex_exit(&zv->zv_state_lock);

rw_enter(&zvol_state_lock, RW_WRITER);
mutex_enter(&zv->zv_state_lock);

zvol_remove(zv);
zvol_os_clear_private(zv);

mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);

zvol_os_free(zv);
}

static void
zvol_free_task(void *arg)
{
Expand All @@ -1233,11 +1275,13 @@ zvol_remove_minors_impl(const char *name)
zvol_state_t *zv, *zv_next;
int namelen = ((name) ? strlen(name) : 0);
taskqid_t t;
list_t free_list;
list_t delay_list, free_list;

if (zvol_inhibit_dev)
return;

list_create(&delay_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
list_create(&free_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));

Expand All @@ -1256,9 +1300,24 @@ zvol_remove_minors_impl(const char *name)
* one is currently using this zv
*/

/* If in use, leave alone */
/*
* If in use, try to throw everyone off and try again
* later.
*/
if (zv->zv_open_count > 0 ||
atomic_read(&zv->zv_suspend_ref)) {
zv->zv_flags |= ZVOL_REMOVING;
t = taskq_dispatch(
zv->zv_objset->os_spa->spa_zvol_taskq,
zvol_remove_minor_task, zv, TQ_SLEEP);
if (t == TASKQID_INVALID) {
/*
* Couldn't create the task, so we'll
* do it in place once the loop is
* finished.
*/
list_insert_head(&delay_list, zv);
}
mutex_exit(&zv->zv_state_lock);
tonyhutter marked this conversation as resolved.
Show resolved Hide resolved
continue;
}
Expand All @@ -1285,7 +1344,11 @@ zvol_remove_minors_impl(const char *name)
}
rw_exit(&zvol_state_lock);

/* Drop zvol_state_lock before calling zvol_free() */
/* Wait for zvols that we couldn't create a remove task for */
while ((zv = list_remove_head(&delay_list)) != NULL)
zvol_remove_minor_task(zv);

/* Free any that we couldn't free in parallel earlier */
while ((zv = list_remove_head(&free_list)) != NULL)
zvol_os_free(zv);
}
Expand All @@ -1305,33 +1368,38 @@ zvol_remove_minor_impl(const char *name)
zv_next = list_next(&zvol_state_list, zv);

mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, name) == 0) {
/*
* By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
if (strcmp(zv->zv_name, name) == 0)
/* Found, leave the the loop with zv_lock held */
break;
mutex_exit(&zv->zv_state_lock);
}

/* If in use, leave alone */
if (zv->zv_open_count > 0 ||
atomic_read(&zv->zv_suspend_ref)) {
mutex_exit(&zv->zv_state_lock);
continue;
}
zvol_remove(zv);
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return;
}

zvol_os_clear_private(zv);
mutex_exit(&zv->zv_state_lock);
break;
} else {
mutex_exit(&zv->zv_state_lock);
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
/*
* In use, so try to throw everyone off, then wait
* until finished.
*/
zv->zv_flags |= ZVOL_REMOVING;
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
zvol_remove_minor_task(zv);
return;
}

/* Drop zvol_state_lock before calling zvol_free() */
zvol_remove(zv);
zvol_os_clear_private(zv);

mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);

if (zv != NULL)
zvol_os_free(zv);
zvol_os_free(zv);
}

/*
Expand Down
4 changes: 0 additions & 4 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -350,10 +350,6 @@ elif sys.platform.startswith('linux'):
'mmp/mmp_active_import': ['FAIL', known_reason],
'mmp/mmp_exported_import': ['FAIL', known_reason],
'mmp/mmp_inactive_import': ['FAIL', known_reason],
'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872],
'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621],
'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872],
'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason],
})

# Not all Github actions runners have scsi_debug module, so we may skip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,6 @@ fi

if ! is_linux ; then
log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
else
if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi

# Disabled for the CentOS 9 kernel
if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi
fi

typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,6 @@
verify_runnable "global"

if is_linux ; then
if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi

# Disabled for the CentOS 9 kernel
if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi

# We need '--force' here since the prior tests may leave a filesystem
# on the zvol, and blkdiscard will see that filesystem and print a
# warning unless you force it.
Expand Down
Loading