diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 506b529dce48..ae2e9da9108d 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -554,6 +554,10 @@ for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); + /* Make sure we're getting the updated enclosure sysfs path */ + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vdev_enc_sysfs_path); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index c143d637059d..9baa9307f05b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -353,7 +353,7 @@ get_usage(zpool_help_t idx) return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: - return (gettext("\tclear [-nF] [device]\n")); + return (gettext("\tclear [[-1]|[-nF]] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" @@ -389,9 +389,10 @@ get_usage(zpool_help_t idx) "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: - return (gettext("\toffline [-f] [-t] ...\n")); + return (gettext("\toffline [-0]|[[-f][-t]] " + "...\n")); case HELP_ONLINE: - return (gettext("\tonline [-e] ...\n")); + return (gettext("\tonline [-1e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); @@ -410,7 +411,7 @@ get_usage(zpool_help_t idx) return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: - return (gettext("\tstatus [-c [script1,script2,...]] " + return (gettext("\tstatus [-1] [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: @@ -443,6 +444,19 @@ get_usage(zpool_help_t idx) } } +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. 
+ */ +static void +fsleep(float sec) +{ + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + nanosleep(&req, NULL); +} + static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { @@ -516,6 +530,319 @@ print_vdev_prop_cb(int prop, void *cb) return (ZPROP_CONT); } +/* + * Given a power string: "on", "off", "1", or "0", return 0 if it's an + * off value, 1 if it's an on value, and -1 if the value is unrecognized. + */ +static int zpool_power_parse_value(char *str) +{ + if ((strcmp(str, "off") == 0) || (strcmp(str, "0") == 0)) + return (0); + + if ((strcmp(str, "on") == 0) || (strcmp(str, "1") == 0)) + return (1); + + return (-1); +} + +/* + * Read from a sysfs file and return an allocated string. Removes + * the newline from the end of the string if there is one. + * + * Returns a string on success (which must be freed), or NULL on error. + */ +static char *zpool_sysfs_gets(char *path) +{ + int fd; + struct stat statbuf; + char *buf = NULL; + ssize_t count = 0; + fd = open(path, O_RDONLY); + if (fd < 0) + return (NULL); + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return (NULL); + } + + buf = calloc(sizeof (*buf), statbuf.st_size + 1); + if (!buf) { + close(fd); + return (NULL); + } + + /* + * Note, we can read less bytes than st_size, and that's ok. Sysfs + * files will report their size is 4k even if they only return a small + * string. + */ + count = read(fd, buf, statbuf.st_size); + if (count < 0) { + /* Error doing read() */ + close(fd); + free(buf); + return (NULL); + } + + /* Remove trailing newline; count is 0 (nothing to strip) on empty read */ + if (count > 0 && buf[count - 1] == '\n') + buf[count - 1] = 0; + + close(fd); + + return (buf); +} + +/* + * Write a string to a sysfs file. + * + * Returns 0 on success, non-zero otherwise. 
+ */ +static int zpool_sysfs_puts(char *path, char *str) +{ + FILE *file; + + file = fopen(path, "w"); + if (!file) { + return (-1); + } + + if (fputs(str, file) < 0) { + fclose(file); + return (-2); + } + /* fclose() flushes the buffered write; report if that fails */ + return (fclose(file) != 0 ? -2 : 0); +} + +/* Given a vdev nvlist_t, rescan its enclosure sysfs path */ +static void +rescan_vdev_config_dev_sysfs_path(nvlist_t *vdev_nv) +{ + update_vdev_config_dev_sysfs_path(vdev_nv, + fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH), + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); +} + +/* + * Given a leaf vdev name like 'L5' return its VDEV_CONFIG_PATH like + * '/dev/disk/by-vdev/L5'. + */ +static const char * +vdev_name_to_path(zpool_handle_t *zhp, char *vdev) +{ + nvlist_t *vdev_nv = NULL; + vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + if (vdev_nv == NULL) { + return (NULL); + } + return (fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH)); +} + +/* + * Given a vdev string return an allocated string containing the sysfs path to + * its power control file. Also do a check if the power control file really + * exists and has correct permissions. + * + * Example returned strings: + * + * /sys/class/enclosure/0:0:122:0/10/power_status + * /sys/bus/pci/slots/10/power + * + * Returns allocated string on success (which must be freed), NULL on failure. + */ +static char * +zpool_power_sysfs_path(zpool_handle_t *zhp, char *vdev) +{ + nvlist_t *vdev_nv = NULL; + const char *enc_sysfs_dir = NULL; + char *path = NULL; + + vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + if (vdev_nv == NULL) { + return (NULL); + } + + /* Make sure we're getting the updated enclosure sysfs path */ + rescan_vdev_config_dev_sysfs_path(vdev_nv); + + if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_dir) != 0) { + return (NULL); + } + + if (asprintf(&path, "%s/power_status", enc_sysfs_dir) == -1) + return (NULL); + + if (access(path, W_OK) != 0) { + free(path); + path = NULL; + /* No HDD 'power_status' file, maybe it's NVMe? 
+ */ + if (asprintf(&path, "%s/power", enc_sysfs_dir) == -1) { + return (NULL); + } + + if (access(path, R_OK | W_OK) != 0) { + /* Not NVMe either */ + free(path); + return (NULL); + } + } + + return (path); +} + +/* + * Given a path to a sysfs power control file, return B_TRUE if you should use + * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control). + */ +static boolean_t +zpool_power_use_word(char *sysfs_path) +{ + size_t len = strlen(sysfs_path); + size_t wlen = strlen("power_status"); + /* A path shorter than the suffix cannot end with it */ + return (len >= wlen && + strcmp(&sysfs_path[len - wlen], "power_status") == 0); +} + +/* + * Check the sysfs power control value for a vdev. + * + * Returns: + * 0 - Power is off + * 1 - Power is on + * -1 - Error + */ +static int +zpool_power_current_state(zpool_handle_t *zhp, char *vdev) +{ + char *path; + char *val; + int rc; + path = zpool_power_sysfs_path(zhp, vdev); + if (path == NULL) + return (-1); + + val = zpool_sysfs_gets(path); + if (val == NULL) { + free(path); + return (-1); + } + + rc = zpool_power_parse_value(val); + free(val); + free(path); + return (rc); +} + +/* + * Turn on or off the slot to a device + * + * Device path is the full path to the device (like /dev/sda or /dev/sda1). + * + * Return code: + * + * 0: Success + * 1: Couldn't read current power state + * 2: Couldn't lookup sysfs path + * 3: Couldn't write new power value + * 4: Power value didn't change to expected value + */ +static int +zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) +{ + char *sysfs_path; + const char *val; + int rc; + + rc = zpool_power_current_state(zhp, vdev); + if (rc == -1) { + return (1); + } + + /* Already correct value? */ + if (rc == (int)turn_on) + return (0); + + sysfs_path = zpool_power_sysfs_path(zhp, vdev); + if (sysfs_path == NULL) + return (2); + + if (zpool_power_use_word(sysfs_path)) { + val = turn_on ? "on" : "off"; + } else { + val = turn_on ? 
"1" : "0"; + } + + rc = zpool_sysfs_puts(sysfs_path, (char *)val); + + free(sysfs_path); + if (rc != 0) { + return (3); + } + + /* + * Wait up to 2 seconds for sysfs power value to change after + * writing it. + */ + for (int i = 0; i < 10; i++) { + rc = zpool_power_current_state(zhp, vdev); + if (rc == (int)turn_on) + return (0); /* success */ + + fsleep(0.2); /* 200ms */ + } + + /* sysfs value never changed */ + return (4); +} + +static int +zpool_power_on(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_TRUE)); +} + +static int +zpool_power_on_and_disk_wait(zpool_handle_t *zhp, char *vdev) +{ + zpool_power_on(zhp, vdev); + zpool_disk_wait(vdev_name_to_path(zhp, vdev)); + + return (0); +} + +static void +zpool_power_on_pool_and_wait_for_devices(zpool_handle_t *zhp) +{ + nvlist_t *nv; + const char *path = NULL; + + /* Power up all the devices first */ + FOR_EACH_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + zpool_power_on(zhp, (char *)path); + } + + /* + * Wait for their devices to show up. Since we powered them on + * at roughly the same time, they should all come online around + * the same time. + */ + FOR_EACH_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + zpool_disk_wait(path); + } +} + +static int +zpool_power_off(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_FALSE)); +} + /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display @@ -2093,6 +2420,7 @@ typedef struct status_cbdata { boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; + boolean_t cb_print_power; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. 
*/ @@ -2378,6 +2706,26 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, else printf(" %5s", rbuf); } + if (cb->cb_print_power) { + if (children == 0) { + /* Only leaf vdevs have physical slots */ + switch (zpool_power_current_state(zhp, (char *) + fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH))) { + case 0: + printf_color(ANSI_RED, " %5s", + gettext("off")); + break; + case 1: + printf(" %5s", gettext("on")); + break; + default: + printf(" %5s", "-"); + } + } else { + printf(" %5s", "-"); + } + } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, @@ -5428,19 +5776,6 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval, interval, count); } -/* - * Floating point sleep(). Allows you to pass in a floating point value for - * seconds. - */ -static void -fsleep(float sec) -{ - struct timespec req; - req.tv_sec = floor(sec); - req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; - nanosleep(&req, NULL); -} - /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. 
@@ -6963,13 +7298,17 @@ zpool_do_online(int argc, char **argv) int ret = 0; vdev_state_t newstate; int flags = 0; + boolean_t is_power_on = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "e")) != -1) { + while ((c = getopt(argc, argv, "1e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; + case '1': + is_power_on = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6977,6 +7316,9 @@ zpool_do_online(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -6998,6 +7340,11 @@ zpool_do_online(int argc, char **argv) for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; + + if (is_power_on) { + zpool_power_on_and_disk_wait(zhp, argv[i]); + } + nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { @@ -7043,7 +7390,9 @@ zpool_do_online(int argc, char **argv) } /* - * zpool offline [-ft] ... + * zpool offline [-0]|[-ft] ... + * + * -0 Power off the enclosure slot to the drive (if possible) * * -f Force the device into a faulted state. 
* @@ -7059,9 +7408,10 @@ zpool_do_offline(int argc, char **argv) int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; + boolean_t is_power_off = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "ft")) != -1) { + while ((c = getopt(argc, argv, "0ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; @@ -7069,6 +7419,9 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; + case '0': + is_power_off = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -7076,6 +7429,20 @@ zpool_do_offline(int argc, char **argv) } } + if (is_power_off && fault) { + (void) fprintf(stderr, + gettext("-0 and -f cannot be used together\n")); + usage(B_FALSE); + return (1); + } + + if (is_power_off && istmp) { + (void) fprintf(stderr, + gettext("-0 and -t cannot be used together\n")); + usage(B_FALSE); + return (1); + } + argc -= optind; argv += optind; @@ -7095,8 +7462,22 @@ zpool_do_offline(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (fault) { - uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + if (is_power_off) { + /* + * Note: we have to power off first, then set REMOVED, + * or else zpool_vdev_set_removed_state() returns + * EAGAIN. + */ + ret = zpool_power_off(zhp, argv[i]); + if (ret != 0) { + (void) fprintf(stderr, "%s %s %d\n", + gettext("unable to power off slot for"), + argv[i], ret); + } + zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE); + + } else if (fault) { vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ @@ -7111,6 +7492,7 @@ zpool_do_offline(int argc, char **argv) if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } + } zpool_close(zhp); @@ -7119,7 +7501,7 @@ zpool_do_offline(int argc, char **argv) } /* - * zpool clear [device] + * zpool clear [-1]|[-nF] [device] * * Clear all errors associated with a pool or a particular device. 
*/ @@ -7131,14 +7513,18 @@ zpool_do_clear(int argc, char **argv) boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; + boolean_t is_power_on = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ - while ((c = getopt(argc, argv, "FnX")) != -1) { + while ((c = getopt(argc, argv, "1FnX")) != -1) { switch (c) { + case '1': + is_power_on = B_TRUE; + break; case 'F': do_rewind = B_TRUE; break; @@ -7155,6 +7541,9 @@ zpool_do_clear(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -7195,6 +7584,14 @@ zpool_do_clear(int argc, char **argv) return (1); } + if (is_power_on) { + if (device == NULL) { + zpool_power_on_pool_and_wait_for_devices(zhp); + } else { + zpool_power_on_and_disk_wait(zhp, device); + } + } + if (zpool_clear(zhp, device, policy) != 0) ret = 1; @@ -8907,6 +9304,10 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } + if (cbp->cb_print_power) { + printf_color(ANSI_BOLD, " %5s", gettext("POWER")); + } + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -8956,6 +9357,7 @@ status_callback(zpool_handle_t *zhp, void *data) * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * + * -1 Display vdev enclosure slot power status * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. 
@@ -8982,8 +9384,11 @@ zpool_do_status(int argc, char **argv) char *cmd = NULL; /* check options */ - while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { + while ((c = getopt(argc, argv, "1c:igLpPsvxDtT:")) != -1) { switch (c) { + case '1': + cb.cb_print_power = B_TRUE; + break; case 'c': if (cmd != NULL) { fprintf(stderr, diff --git a/include/libzfs.h b/include/libzfs.h index dbb6340b0a43..4f06b5d3c24c 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -319,6 +319,9 @@ _LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +_LIBZFS_H int zpool_vdev_set_removed_state(zpool_handle_t *, uint64_t, + vdev_aux_t); + _LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); _LIBZFS_H nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, diff --git a/include/libzutil.h b/include/libzutil.h index 9842c225b6f0..43258d4538f4 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -97,6 +97,7 @@ _LIBZUTIL_H int zpool_find_config(libpc_handle_t *, const char *, nvlist_t **, _LIBZUTIL_H const char * const * zpool_default_search_paths(size_t *count); _LIBZUTIL_H int zpool_read_label(int, nvlist_t **, int *); _LIBZUTIL_H int zpool_label_disk_wait(const char *, int); +_LIBZUTIL_H int zpool_disk_wait(const char *); struct udev_device; @@ -205,6 +206,53 @@ _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, void *data); +int for_each_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, void *data); +int for_each_leaf_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, + void *data); +/* + * Often you'll want to iterate over all the vdevs in the pool, but don't want + * to use for_each_vdev() since it requires a callback function. 
+ * + * Instead you can use FOR_EACH_VDEV(): + * + * zpool_handle_t *zhp // Assume this is initialized + * nvlist_t *nv + * ... + * FOR_EACH_VDEV(zhp, nv) { + * const char *path = NULL; + * nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path); + * printf("Looking at vdev %s\n", path); + * } + * + * Note: FOR_EACH_VDEV runs in O(n^2) time where n = number of vdevs. However, + * there's an upper limit of 256 vdevs per dRAID top-level vdevs (TLDs), 255 for + * raidz2 TLDs, a real world limit of ~500 vdevs for mirrors, so this shouldn't + * really be an issue. + * + * Here are some micro-benchmarks of a complete FOR_EACH_VDEV loop on a RAID0 + * pool: + * + * 100 vdevs = 0.7ms + * 500 vdevs = 17ms + * 750 vdevs = 40ms + * 1000 vdevs = 82ms + * + * The NULL at the end of the for() loop gets around a "comma or semicolon + * followed by non-blank" checkstyle error. + */ +#define __FOR_EACH_VDEV(__zhp, __nv, __func) { \ + __nv = zpool_get_config(__zhp, NULL); \ + VERIFY0(nvlist_lookup_nvlist(__nv, ZPOOL_CONFIG_VDEV_TREE, &__nv)); \ + } \ + for (nvlist_t *__root_nv = __nv, *__state = (nvlist_t *)0; \ + for_each_vdev_cb(&__state, __root_nv, __func, &__nv) == 1; NULL) + +#define FOR_EACH_VDEV(__zhp, __nv) \ + __FOR_EACH_VDEV(__zhp, __nv, for_each_vdev_macro_helper_func) + +#define FOR_EACH_LEAF_VDEV(__zhp, __nv) \ + __FOR_EACH_VDEV(__zhp, __nv, for_each_leaf_vdev_macro_helper_func) + int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 71cf029deff5..c7b8617ef35e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3036,6 +3036,9 @@ zpool_vdev_is_interior(const char *name) return (B_FALSE); } +/* + * Lookup the nvlist for a given vdev. 
+ */ nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) @@ -3043,6 +3046,7 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; + boolean_t __avail_spare, __l2cache, __log; search = fnvlist_alloc(); @@ -3058,6 +3062,18 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); + /* + * User can pass NULL for avail_spare, l2cache, and log, but + * we still need to provide variables to vdev_to_nvlist_iter(), so + * just point them to junk variables here. + */ + if (!avail_spare) + avail_spare = &__avail_spare; + if (!l2cache) + l2cache = &__l2cache; + if (!log) + log = &__log; + *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) @@ -3313,21 +3329,23 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) } /* - * Mark the given vdev degraded. + * Generic set vdev state function */ -int -zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +static int +zpool_vdev_set_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux, + vdev_state_t state) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); + dgettext(TEXT_DOMAIN, "cannot set %s %llu"), + zpool_state_to_name(state, aux), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; - zc.zc_cookie = VDEV_STATE_DEGRADED; + zc.zc_cookie = state; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) @@ -3336,6 +3354,27 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) return (zpool_standard_error(hdl, errno, errbuf)); } +/* + * Mark the given vdev degraded. 
+ */ +int +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_DEGRADED)); +} + +/* + * Mark the given vdev as in a removed state (as if the device does not exist). + * + * This is different than zpool_vdev_remove() which does a removal of a device + * from the pool (but the device does exist). + */ +int +zpool_vdev_set_removed_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_REMOVED)); +} + /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index fbfae4f7e685..1690b82cab5b 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -170,25 +170,17 @@ zpool_open_func(void *arg) if (rn->rn_labelpaths) { const char *path = NULL; const char *devid = NULL; - const char *env = NULL; rdsk_node_t *slice; avl_index_t where; - int timeout; int error; if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) return; - env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); - if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || - timeout < 0) { - timeout = DISK_LABEL_WAIT; - } - /* * Allow devlinks to stabilize so all paths are available. */ - zpool_label_disk_wait(rn->rn_name, timeout); + zpool_disk_wait(rn->rn_name); if (path != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); @@ -682,6 +674,24 @@ zpool_label_disk_wait(const char *path, int timeout_ms) #endif /* HAVE_LIBUDEV */ } +/* + * Simplified version of zpool_label_disk_wait() where we wait for a device + * to appear using the default timeouts. 
+ */ +int +zpool_disk_wait(const char *path) +{ + char *env; + int timeout; + env = secure_getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); + if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || + timeout < 0) { + timeout = DISK_LABEL_WAIT; + } + + return (zpool_label_disk_wait(path, timeout)); +} + /* * Encode the persistent devices strings * used for the vdev disk label @@ -767,6 +777,10 @@ encode_device_strings(const char *path, vdev_dev_strs_t *ds, * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' * + * If an old path was in the nvlist, and the rescan can not find a new path, + * then keep the old path, since the disk may have been removed. + * + * path: The vdev path (value from ZPOOL_CONFIG_PATH) * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ void @@ -774,6 +788,9 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, const char *key) { char *upath, *spath; + const char *oldpath = NULL; + + (void) nvlist_lookup_string(nv, key, &oldpath); /* Add enclosure sysfs path (if disk is in an enclosure). */ upath = zfs_get_underlying_path(path); @@ -782,7 +799,14 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, if (spath) { (void) nvlist_add_string(nv, key, spath); } else { - (void) nvlist_remove_all(nv, key); + /* + * We couldn't dynamically scan the disk's enclosure sysfs path. + * This could be because the disk went away. If there's an old + * enclosure sysfs path in the nvlist, then keep using it. 
+ */ + if (!oldpath) { + (void) nvlist_remove_all(nv, key); + } } free(upath); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 19d8a4742813..4a20a77eed8a 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1898,6 +1898,85 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp, return (0); } +static boolean_t +vdev_is_leaf(nvlist_t *nv) +{ + uint_t children = 0; + nvlist_t **child; + + (void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + return (children == 0); +} + +/* + * This function is called by our FOR_EACH_VDEV() macros. + * + * state: State machine status (stored inside of a (nvlist_t *)) + * nv: The current vdev nvlist_t we are iterating over. + * last_nv: The previous vdev nvlist_t we returned to the user in + * the last iteration of FOR_EACH_VDEV(). We use it + * to find the next vdev nvlist_t we should return. + * leafs_only: Only return leaf vdevs. + * + * Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if + * we're still searching for it. + */ +static int +__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, + boolean_t leafs_only) +{ + enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2}; + + /* The very first entry in the NV list is a special case */ + if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) { + if (leafs_only && !vdev_is_leaf(nv)) + return (0); + + *((nvlist_t **)last_nv) = nv; + *((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING; + return (1); + } + + /* + * We came across our last_nv, meaning the next one is the one we + * want + */ + if ((nv == *((nvlist_t **)last_nv))) { + /* Next iteration of this function will return the nvlist_t */ + *((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH; + return (0); + } + + /* + * We marked NEXT_IS_MATCH on the previous iteration, so this is the one + * we want. 
+ */ + if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) { + if (leafs_only && !vdev_is_leaf(nv)) + return (0); + + *((nvlist_t **)last_nv) = nv; + *((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING; + return (1); + } + + return (0); +} + +int +for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv) +{ + return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE)); +} + +int +for_each_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv) +{ + return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE)); +} + /* * Internal function for iterating over the vdevs. * diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index 7b9d40c74ebd..5732838942be 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -36,6 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm clear +.Op Fl 1 .Ar pool .Oo Ar device Oc Ns … . @@ -52,6 +53,15 @@ Pools with enabled which have been suspended cannot be resumed. While the pool was suspended, it may have been imported on another host, and resuming I/O could result in pool damage. +.Bl -tag -width Ds +.It Fl 1 +Power on the device's slot in the storage enclosure and wait for the device +to show up before attempting to clear errors. +This is done on all the devices specified. +Alternatively, you can set the +.Sy ZPOOL_AUTO_POWER_ON_SLOT +environment variable to always enable this behavior. +.El . .Sh SEE ALSO .Xr zdb 8 , diff --git a/man/man8/zpool-offline.8 b/man/man8/zpool-offline.8 index edcf1d06ab67..d0090c994e62 100644 --- a/man/man8/zpool-offline.8 +++ b/man/man8/zpool-offline.8 @@ -36,12 +36,13 @@ .Sh SYNOPSIS .Nm zpool .Cm offline -.Op Fl ft +.Op Fl Sy 0 Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Nm zpool .Cm online -.Op Fl e +.Op Fl Sy 1 +.Op Fl Sy e .Ar pool .Ar device Ns … . 
@@ -50,7 +51,7 @@ .It Xo .Nm zpool .Cm offline -.Op Fl ft +.Op Fl Sy 0 Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Xc @@ -60,6 +61,8 @@ While the is offline, no attempt is made to read or write to the device. This command is not applicable to spares. .Bl -tag -width Ds +.It Fl 0 +Power off the device's slot in the storage enclosure. .It Fl f Force fault. Instead of offlining the disk, put it into a faulted state. @@ -73,6 +76,7 @@ Upon reboot, the specified physical device reverts to its previous state. .It Xo .Nm zpool .Cm online +.Op Fl 1 .Op Fl e .Ar pool .Ar device Ns … @@ -80,6 +84,12 @@ Upon reboot, the specified physical device reverts to its previous state. Brings the specified physical device online. This command is not applicable to spares. .Bl -tag -width Ds +.It Fl 1 +Power on the device's slot in the storage enclosure and wait for the device +to show up before attempting to online it. +Alternatively, you can set the +.Sy ZPOOL_AUTO_POWER_ON_SLOT +environment variable to always enable this behavior. .It Fl e Expand the device to use all available space. If the device is part of a mirror or raidz then all devices must be expanded diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 8f9580cf086e..becf1ffda73e 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -57,6 +57,8 @@ and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change. .Bl -tag -width Ds +.It Fl 1 +Display vdev enclosure slot power status (on or off). .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … Run a script (or scripts) on each vdev and include the output as a new column in the diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 4c4020bdd810..e24506578979 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -456,6 +456,16 @@ Use ANSI color in and .Nm zpool Cm iostat output. 
+.It Sy ZPOOL_AUTO_POWER_ON_SLOT +Automatically attempt to turn on the enclosure slot power to a drive when +running the +.Nm zpool Cm online +or +.Nm zpool Cm clear +commands. +This has the same effect as passing the +.Fl 1 +option to those commands. .It Sy ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool. This is a colon-separated list of directories in which