diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 70a4ed46f263..50f2ed2e8aa5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4760,12 +4760,12 @@ print_label_numbers(const char *prefix, const cksum_record_t *rec) putchar('\n'); } -#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) +#define MAX_UBERBLOCK_COUNT (VDEV_LARGE_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { - vdev_label_t label; uint64_t label_offset; nvlist_t *config_nv; + char *ub_array; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; @@ -4774,7 +4774,7 @@ typedef struct zdb_label { } zdb_label_t; static void -print_label_header(zdb_label_t *label, int l) +print_label_header(zdb_label_t *label, boolean_t large_label, int l) { if (dump_opt['q']) @@ -4784,7 +4784,7 @@ print_label_header(zdb_label_t *label, int l) return; (void) printf("------------------------------------\n"); - (void) printf("LABEL %d %s\n", l, + (void) printf("LABEL(%s) %d %s\n", large_label ? "new" : "old", l, label->cksum_valid ? 
"" : "(Bad label cksum)"); (void) printf("------------------------------------\n"); @@ -4990,7 +4990,7 @@ dump_l2arc_header(int fd) int error = B_FALSE; if (pread64(fd, &l2dhdr, sizeof (l2dhdr), - VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { + VDEV_OLD_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; } else { if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) @@ -5062,7 +5062,8 @@ dump_l2arc_header(int fd) } static void -dump_config_from_label(zdb_label_t *label, size_t buflen, int l) +dump_config_from_label(zdb_label_t *label, size_t buflen, boolean_t large_label, + int l) { if (dump_opt['q']) return; @@ -5070,7 +5071,7 @@ dump_config_from_label(zdb_label_t *label, size_t buflen, int l) if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; - print_label_header(label, l); + print_label_header(label, large_label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); @@ -5081,23 +5082,20 @@ dump_config_from_label(zdb_label_t *label, size_t buflen, int l) #define ZDB_MAX_UB_HEADER_SIZE 32 static void -dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) +dump_label_uberblocks(zdb_label_t *label, vdev_t *vd, int label_num) { - - vdev_t vd; + boolean_t large_label = vd->vdev_large_label; char header[ZDB_MAX_UB_HEADER_SIZE]; - vd.vdev_ashift = ashift; - vd.vdev_top = &vd; - - for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { - uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); - uberblock_t *ub = (void *)((char *)&label->label + uoff); + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vd); i++) { + uint64_t uoff = i << VDEV_UBERBLOCK_SHIFT(vd); + uberblock_t *ub = (void *)(label->ub_array + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { - print_label_header(label, label_num); + print_label_header(label, large_label, + label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; @@ -5108,10 +5106,10 @@ dump_label_uberblocks(zdb_label_t 
*label, uint64_t ashift, int label_num) if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && - (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) + (i >= VDEV_UBERBLOCK_COUNT(vd) - MMP_BLOCKS_PER_LABEL)) continue; - print_label_header(label, label_num); + print_label_header(label, large_label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); @@ -5383,7 +5381,7 @@ zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) } static boolean_t -label_cksum_valid(vdev_label_t *label, uint64_t offset) +phys_cksum_valid(void *data, uint64_t offset, uint64_t size) { zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; zio_cksum_t expected_cksum; @@ -5392,10 +5390,8 @@ label_cksum_valid(vdev_label_t *label, uint64_t offset) zio_eck_t *eck; int byteswap; - void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); - eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; + eck = (zio_eck_t *)((char *)(data) + size) - 1; - offset += offsetof(vdev_label_t, vl_vdev_phys); ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); @@ -5405,8 +5401,8 @@ label_cksum_valid(vdev_label_t *label, uint64_t offset) expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); - ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); + abd_t *abd = abd_get_from_buf(data, size); + ci->ci_func[byteswap](abd, size, NULL, &actual_cksum); abd_free(abd); if (byteswap) @@ -5418,12 +5414,97 @@ label_cksum_valid(vdev_label_t *label, uint64_t offset) return (B_FALSE); } +static boolean_t +label_cksum_valid(vdev_label_t *label, uint64_t offset) +{ + return (phys_cksum_valid(&label->vl_vdev_phys, + offset + offsetof(vdev_label_t, vl_vdev_phys), VDEV_PHYS_SIZE)); +} + +static nvlist_t * +vdev_config_lookup(nvlist_t *vdev_config, uint64_t guid) +{ + if 
(fnvlist_lookup_uint64(vdev_config, ZPOOL_CONFIG_GUID) == guid) + return (vdev_config); + + nvlist_t **children; + uint_t child_count; + if (nvlist_lookup_nvlist_array(vdev_config, ZPOOL_CONFIG_CHILDREN, + &children, &child_count) != 0) + return (NULL); + for (int c = 0; c < child_count; c++) { + nvlist_t *child; + if ((child = vdev_config_lookup(children[c], guid)) != + NULL) + return (child); + } + + return (NULL); +} + +static boolean_t +has_large_label(int fd, uint64_t psize) +{ + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_t label; + nvlist_t *config; + uint64_t offset = vdev_label_offset(psize, l, 0, B_FALSE); + + if (pread64(fd, &label, sizeof (label), offset) != + sizeof (label)) + continue; + if (!label_cksum_valid(&label, offset)) + continue; + + char *buf = label.vl_vdev_phys.vp_nvlist; + size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + uint64_t pool_state; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0 && (pool_state == POOL_STATE_SPARE || + pool_state == POOL_STATE_L2CACHE)) { + boolean_t retry_new = B_FALSE; + (void) nvlist_lookup_boolean_value(config, + ZPOOL_CONFIG_LARGE_LABEL, &retry_new); + if (retry_new) + return (B_TRUE); + else + continue; + } + nvlist_t *vdev_tree; + if (nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + continue; + size_t size; + if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) + size = buflen; + + boolean_t retry_new = B_FALSE; + if (fnvlist_lookup_uint64(config, + ZPOOL_CONFIG_VERSION) >= SPA_VERSION_FEATURES) { + uint64_t guid = fnvlist_lookup_uint64(config, + ZPOOL_CONFIG_GUID); + nvlist_t *child = vdev_config_lookup(vdev_tree, guid); + ASSERT(child); + ASSERT3U(guid, ==, + fnvlist_lookup_uint64(child, ZPOOL_CONFIG_GUID)); + (void) nvlist_lookup_boolean_value(child, + ZPOOL_CONFIG_LARGE_LABEL, &retry_new); + } + if (retry_new) { + return (B_TRUE); + } + } + return (B_FALSE); +} + 
static int dump_label(const char *dev) { char path[MAXPATHLEN]; - zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; - uint64_t psize, ashift, l2cache; + zdb_label_t labels[VDEV_LABELS] = {{0}}; + uint64_t psize, ashift, l2cache, ub_size; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; @@ -5479,6 +5560,10 @@ dump_label(const char *dev) psize = statbuf.st_size; psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; + boolean_t large_label = has_large_label(fd, psize); + vdev_t vd; + vd.vdev_top = &vd; + vd.vdev_large_label = large_label; /* * 1. Read the label from disk @@ -5488,27 +5573,111 @@ dump_label(const char *dev) */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; - char *buf = label->label.vl_vdev_phys.vp_nvlist; - size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + char *buf; + size_t buflen; nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; - vdev_t vd; - label->label_offset = vdev_label_offset(psize, l, 0); + label->label_offset = vdev_label_offset(psize, l, 0, + large_label); + + if (large_label) { + char toc_buf[VDEV_TOC_SIZE]; + if (pread64(fd, toc_buf, VDEV_TOC_SIZE, + label->label_offset + VDEV_LARGE_PAD_SIZE) != + VDEV_TOC_SIZE) { + if (!dump_opt['q']) + (void) printf("failed to read label " + "%d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } - if (pread64(fd, &label->label, sizeof (label->label), - label->label_offset) != sizeof (label->label)) { - if (!dump_opt['q']) - (void) printf("failed to read label %d\n", l); - label->read_failed = B_TRUE; - error = B_TRUE; - continue; - } + label->cksum_valid = + phys_cksum_valid(toc_buf, + label->label_offset + VDEV_LARGE_PAD_SIZE, + VDEV_TOC_SIZE); + + label->read_failed = B_FALSE; + nvlist_t *toc; + if (nvlist_unpack(toc_buf, VDEV_TOC_SIZE, &toc, 0)) { + if (!dump_opt['q']) + (void) printf("failed to unpack TOC of " + "label %d\n", l); + error = B_TRUE; + continue; + 
} - label->read_failed = B_FALSE; - label->cksum_valid = label_cksum_valid(&label->label, - label->label_offset); + if (dump_opt['l'] > 2) + nvlist_print(stdout, toc); + + uint32_t conf_size, conf_off; + if (!vdev_toc_get_secinfo(toc, VDEV_TOC_VDEV_CONFIG, + &conf_size, &conf_off)) { + if (!dump_opt['q']) + (void) printf("failed to read size of " + "vdev config of label %d\n", l); + error = B_TRUE; + fnvlist_free(toc); + continue; + } + fnvlist_free(toc); + buf = alloca(conf_size); + buflen = conf_size; + uint64_t phys_off = label->label_offset + + VDEV_LARGE_PAD_SIZE + conf_off; + if (pread64(fd, buf, conf_size, phys_off) != + conf_size) { + if (!dump_opt['q']) + (void) printf("failed to read " + "vdev config of label %d\n", l); + error = B_TRUE; + continue; + } + + label->cksum_valid = label->cksum_valid && + phys_cksum_valid(buf, phys_off, + conf_size); + ub_size = VDEV_LARGE_UBERBLOCK_RING; + label->ub_array = malloc(ub_size); + + if (pread64(fd, label->ub_array, ub_size, + label->label_offset + VDEV_LARGE_UBERBLOCK_RING) != + ub_size) { + if (!dump_opt['q']) + (void) printf("failed to read " + "uberblocks for label %d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } + } else { + vdev_label_t vl; + if (pread64(fd, &vl, sizeof (vl), + label->label_offset) != sizeof (vl)) { + if (!dump_opt['q']) + (void) printf("failed to read label " + "%d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } + + label->read_failed = B_FALSE; + label->cksum_valid = label_cksum_valid(&vl, + label->label_offset); + + buf = alloca(sizeof (vl.vl_vdev_phys)); + buflen = sizeof (vl.vl_vdev_phys); + memcpy(buf, &vl.vl_vdev_phys, buflen); + + ub_size = VDEV_UBERBLOCK_RING; + label->ub_array = alloca(ub_size); + memcpy(label->ub_array, vl.vl_uberblock, + VDEV_UBERBLOCK_RING); + } if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; @@ -5543,11 +5712,11 @@ dump_label(const char *dev) } vd.vdev_ashift = ashift; - 
vd.vdev_top = &vd; + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { - uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); - uberblock_t *ub = (void *)((char *)label + uoff); + uint64_t uoff = i << VDEV_UBERBLOCK_SHIFT(&vd); + uberblock_t *ub = (void *)(label->ub_array + uoff); if (uberblock_verify(ub)) continue; @@ -5564,20 +5733,21 @@ dump_label(const char *dev) */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; - size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + size_t buflen = large_label ? VDEV_LARGE_UBERBLOCK_RING : + VDEV_PHYS_SIZE - sizeof (zio_eck_t); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { - dump_config_from_label(label, buflen, l); + dump_config_from_label(label, buflen, large_label, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) - dump_label_uberblocks(label, ashift, l); + dump_label_uberblocks(label, &vd, l); nvlist_free(label->config_nv); } @@ -5588,6 +5758,12 @@ dump_label(const char *dev) if (read_l2arc_header) error |= dump_l2arc_header(fd); + if (large_label) { + for (int l = 0; l < VDEV_LABELS; l++) + if (labels[l].ub_array) + free(labels[l].ub_array); + } + cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); diff --git a/cmd/zhack.c b/cmd/zhack.c index 8ffbf91ffb30..ac51d878334f 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -685,11 +685,11 @@ zhack_do_metaslab(int argc, char **argv) return (0); } -#define ASHIFT_UBERBLOCK_SHIFT(ashift) \ +#define ASHIFT_UBERBLOCK_SHIFT(ashift, new) \ MIN(MAX(ashift, UBERBLOCK_SHIFT), \ - MAX_UBERBLOCK_SHIFT) -#define ASHIFT_UBERBLOCK_SIZE(ashift) \ - (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift)) + MAX_UBERBLOCK_SHIFT(new)) +#define ASHIFT_UBERBLOCK_SIZE(ashift, new) \ + (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift, new)) #define REPAIR_LABEL_STATUS_CKSUM (1 << 0) #define REPAIR_LABEL_STATUS_UB (1 << 1) @@ -714,6 +714,26 @@ 
zhack_repair_read_label(const int fd, vdev_label_t *vl, return (0); } +static int +zhack_repair_read(const int fd, uint8_t *buf, size_t buflen, + const uint64_t offset, const int l) +{ + const int err = pread64(fd, buf, buflen, offset); + + if (err == -1) { + (void) fprintf(stderr, + "error: cannot read buffer at %lu for label %d: %s\n", + offset, l, strerror(errno)); + return (err); + } else if (err != buflen) { + (void) fprintf(stderr, + "error: bad read size at %lu for label %d \n", offset, l); + return (-1); /* short read, even 0, is failure */ + } + + return (0); +} + static int zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap) { @@ -875,7 +895,7 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l, (char *)vl + offsetof(vdev_label_t, vl_uberblock); zio_eck_t *ub_eck = (zio_eck_t *) - ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1; + ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1; if (ub_eck->zec_magic != 0) { (void) fprintf(stderr, @@ -894,10 +914,39 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l, if (zhack_repair_write_label(l, fd, byteswap, ub_data, ub_eck, label_offset + offsetof(vdev_label_t, vl_uberblock), - ASHIFT_UBERBLOCK_SIZE(ashift))) + ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; } +static void +zhack_repair_write_uberblock_new(void *ub_data, const int l, + const uint64_t ashift, const int fd, const int byteswap, + const uint64_t label_offset, uint32_t *labels_repaired) +{ + zio_eck_t *ub_eck = + (zio_eck_t *) + ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_TRUE))) - 1; + + if (ub_eck->zec_magic != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Expected Uberblock checksum magic number to " + "be 0, but got %" PRIu64 "\n", + l, ub_eck->zec_magic); + (void) fprintf(stderr, "It would appear there's already " + "a checksum for the uberblock.\n"); + return; + } + + + ub_eck->zec_magic = byteswap ? 
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + + if (zhack_repair_write_label(l, fd, byteswap, + ub_data, ub_eck, label_offset + VDEV_LARGE_UBERBLOCK_RING, + ASHIFT_UBERBLOCK_SIZE(ashift, B_TRUE))) + labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; +} + static void zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) { @@ -911,12 +960,13 @@ zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) static int zhack_repair_test_cksum(const int byteswap, void *vdev_data, - zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l) + const uint64_t size, zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, + const int l) { const zio_cksum_t expected_cksum = vdev_eck->zec_cksum; zio_cksum_t actual_cksum; zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset, - VDEV_PHYS_SIZE, vdev_eck, &actual_cksum); + size, vdev_eck, &actual_cksum); const uint64_t expected_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; const uint64_t actual_magic = vdev_eck->zec_magic; @@ -975,8 +1025,8 @@ zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg) static void zhack_repair_one_label(const zhack_repair_op_t op, const int fd, - vdev_label_t *vl, const uint64_t label_offset, const int l, - uint32_t *labels_repaired) + vdev_label_t *vl, const uint64_t filesize, const int l, + uint32_t *labels_repaired, boolean_t *large_label) { ssize_t err; uberblock_t *ub = (uberblock_t *)vl->vl_uberblock; @@ -984,6 +1034,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); zio_eck_t *vdev_eck = (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; + const uint64_t label_offset = vdev_label_offset(filesize, l, 0, + B_FALSE); const uint64_t vdev_phys_offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); nvlist_t *cfg; @@ -1005,8 +1057,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, } if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && - zhack_repair_test_cksum(byteswap, vdev_data, 
vdev_eck, - vdev_phys_offset, l) != 0) { + zhack_repair_test_cksum(byteswap, vdev_data, VDEV_PHYS_SIZE, + vdev_eck, vdev_phys_offset, l) != 0) { (void) fprintf(stderr, "It would appear checksums are " "corrupted. Try zhack repair label -c \n"); return; @@ -1016,6 +1068,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, if (err) return; + (void) nvlist_lookup_boolean_value(cfg, ZPOOL_CONFIG_LARGE_LABEL, + large_label); + if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { char *buf; size_t buflen; @@ -1047,15 +1102,210 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, zhack_repair_write_uberblock(vl, l, ashift, fd, byteswap, label_offset, labels_repaired); + if (*large_label) { + zhack_repair_write_uberblock_new(ub, l, ashift, + fd, byteswap, vdev_label_offset(filesize, l, 0, + B_TRUE), labels_repaired); + } } if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck, vdev_phys_offset, VDEV_PHYS_SIZE)) - labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; fsync(fd); } +static void +zhack_repair_one_label_large(const zhack_repair_op_t op, const int fd, + const uint64_t label_offset, const int l, uint32_t *labels_repaired) +{ + ssize_t err; + void *toc_data = NULL, *bootenv = NULL, *vdev_config = NULL; + void *spa_config = NULL, *ub = NULL; + /* + * Note that currently, this can't handle disks with larger than 8k + * sector sizes. That needs to be fixed eventually. 
+ */ + toc_data = malloc(VDEV_TOC_SIZE); + err = zhack_repair_read(fd, toc_data, VDEV_TOC_SIZE, + label_offset + VDEV_LARGE_PAD_SIZE, l); + if (err) + goto out; + zio_eck_t *toc_eck = (zio_eck_t *)(toc_data + VDEV_TOC_SIZE) - 1; + if (toc_eck->zec_magic == 0) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number to not be zero" + "\n", + l); + (void) fprintf(stderr, "There should already be a checksum " + "for the label.\n"); + goto out; + } + + int byteswap = + (toc_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); + + if (byteswap) { + byteswap_uint64_array(&toc_eck->zec_cksum, + sizeof (zio_cksum_t)); + toc_eck->zec_magic = BSWAP_64(toc_eck->zec_magic); + } + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, toc_data, VDEV_TOC_SIZE, + toc_eck, label_offset + VDEV_LARGE_PAD_SIZE, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. Try zhack repair label -c \n"); + goto out; + } + + nvlist_t *toc; + err = nvlist_unpack(toc_data, VDEV_TOC_SIZE, &toc, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist TOC %d\n", l); + goto out; + } + + uint32_t bootenv_size, vc_size, sc_size; + uint32_t bootenv_offset, vc_offset, sc_offset; + if (!vdev_toc_get_secinfo(toc, VDEV_TOC_BOOT_REGION, + &bootenv_size, &bootenv_offset) || !vdev_toc_get_secinfo(toc, + VDEV_TOC_VDEV_CONFIG, &vc_size, &vc_offset) || + !vdev_toc_get_secinfo(toc, VDEV_TOC_POOL_CONFIG, &sc_size, + &sc_offset)) { + fnvlist_free(toc); + (void) fprintf(stderr, + "error: TOC missing core fields %d\n", l); + goto out; + } + fnvlist_free(toc); + bootenv = malloc(bootenv_size); + zio_eck_t *bootenv_eck = (zio_eck_t *)(bootenv + bootenv_size) - 1; + vdev_config = malloc(vc_size); + zio_eck_t *vc_eck = (zio_eck_t *)(vdev_config + vc_size) - 1; + spa_config = malloc(sc_size); + zio_eck_t *sc_eck = (zio_eck_t *)(spa_config + sc_size) - 1; + + uint64_t base_offset = label_offset + VDEV_LARGE_PAD_SIZE; + if (bootenv_size != 0) { + if ((err = zhack_repair_read(fd, bootenv, + 
bootenv_size, base_offset + bootenv_offset, l))) + goto out; + if (byteswap) { + byteswap_uint64_array(&bootenv_eck->zec_cksum, + sizeof (zio_cksum_t)); + bootenv_eck->zec_magic = + BSWAP_64(bootenv_eck->zec_magic); + } + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, bootenv, bootenv_size, + bootenv_eck, base_offset + bootenv_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. Try zhack repair label -c \n"); + goto out; + } + } + + if ((err = zhack_repair_read(fd, vdev_config, vc_size, + base_offset + vc_offset, l))) + goto out; + + if (byteswap) { + byteswap_uint64_array(&vc_eck->zec_cksum, + sizeof (zio_cksum_t)); + vc_eck->zec_magic = BSWAP_64(vc_eck->zec_magic); + } + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, vdev_config, vc_size, + vc_eck, base_offset + vc_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. Try zhack repair label -c \n"); + goto out; + } + if ((err = zhack_repair_read(fd, spa_config, sc_size, + base_offset + sc_offset, l))) + goto out; + + if (byteswap) { + byteswap_uint64_array(&sc_eck->zec_cksum, + sizeof (zio_cksum_t)); + sc_eck->zec_magic = BSWAP_64(sc_eck->zec_magic); + } + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, spa_config, sc_size, + sc_eck, base_offset + sc_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. 
Try zhack repair label -c \n"); + goto out; + } + + nvlist_t *cfg; + err = nvlist_unpack(vdev_config, vc_size - sizeof (zio_eck_t), &cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return; + } + + ub = malloc(UBERBLOCK_SHIFT); + err = zhack_repair_read(fd, ub, UBERBLOCK_SHIFT, + label_offset + VDEV_LARGE_UBERBLOCK_RING, l); + if (err) + goto out; + + uint64_t ashift; + err = zhack_repair_get_ashift(cfg, l, &ashift); + if (err) + return; + + if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { + char *buf; + size_t buflen; + + err = zhack_repair_undetach(ub, cfg, l); + if (err) + return; + + buf = vdev_config; + buflen = vc_size - sizeof (zio_eck_t); + if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to pack nvlist\n", l); + return; + } + + zhack_repair_write_uberblock_new(ub, l, ashift, fd, byteswap, + label_offset, labels_repaired); + } + + if (zhack_repair_write_label(l, fd, byteswap, toc_data, toc_eck, + base_offset, VDEV_TOC_SIZE)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + if (zhack_repair_write_label(l, fd, byteswap, bootenv, bootenv_eck, + base_offset + bootenv_offset, bootenv_size)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + if (zhack_repair_write_label(l, fd, byteswap, vdev_config, vc_eck, + base_offset + vc_offset, vc_size)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + if (zhack_repair_write_label(l, fd, byteswap, spa_config, sc_eck, + base_offset + sc_offset, sc_size)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + + fsync(fd); +out: + if (toc_data) + free(toc_data); + if (bootenv) + free(bootenv); + if (vdev_config) + free(vdev_config); + if (spa_config) + free(spa_config); + if (ub) + free(ub); +} + static const char * zhack_repair_label_status(const uint32_t label_status, const uint32_t to_check) @@ -1096,9 +1346,18 @@ zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv) filesize = (filesize 
/ sizeof (vdev_label_t)) * sizeof (vdev_label_t); + boolean_t large_label = B_FALSE; for (int l = 0; l < VDEV_LABELS; l++) { zhack_repair_one_label(op, fd, &labels[l], - vdev_label_offset(filesize, l, 0), l, labels_repaired); + filesize, l, labels_repaired, &large_label); + if (large_label) + break; + } + if (large_label) { + for (int l = 0; l < VDEV_LABELS; l++) { + zhack_repair_one_label_large(op, fd, vdev_label_offset( + filesize, l, 0, B_TRUE), l, labels_repaired); + } } close(fd); diff --git a/cmd/ztest.c b/cmd/ztest.c index 89752dcb0f0f..23d1fabeb7e3 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6429,7 +6429,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) int iters = 1000; int maxfaults; int mirror_save; - vdev_t *vd0 = NULL; + vdev_t *vd0 = NULL, *vdrand = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; boolean_t injected = B_FALSE; @@ -6514,6 +6514,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; + vdrand = vdev_lookup_by_path(spa->spa_root_vdev, pathrand); /* * If the top-level vdev needs to be resilvered @@ -6559,7 +6560,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) (void) pthread_rwlock_unlock(&ztest_name_lock); goto out; } - vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; + vdrand = vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); @@ -6665,7 +6666,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ - if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) + if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE(vdrand)) continue; /* @@ -6673,10 +6674,11 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). 
*/ - uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), - uint64_t); - if ((leaf & 1) == 1 && - offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) + uint64_t psize = P2ALIGN_TYPED(fsize, + vdrand->vdev_large_label ? VDEV_LARGE_LABEL_ALIGN : + sizeof (vdev_label_t), uint64_t); + if ((leaf & 1) == 1 && offset + sizeof (bad) > + psize - VDEV_LABEL_END_SIZE(vdrand)) continue; if (mirror_save != zs->zs_mirrors) { diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 662fd81c5ee1..443acf548979 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -875,6 +875,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" +#define ZPOOL_CONFIG_LARGE_LABEL "com.klarasystems:large_label" /* * The persistent vdev state is stored as separate values rather than a single diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 62b062984d36..e80859583008 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -234,6 +234,7 @@ struct spa { uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ + boolean_t spa_create_large_label_ok; boolean_t spa_trust_config; /* do we trust vdev tree? */ boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from? 
*/ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 510474d6c085..aacbab15e0c5 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -218,13 +218,15 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, * Label routines */ struct uberblock; -extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); -extern int vdev_label_number(uint64_t psise, uint64_t offset); +extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset, + boolean_t new); +extern int vdev_label_number(uint64_t psise, uint64_t offset, boolean_t new); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); -extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t - offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); +extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, boolean_t new, + abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, + void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5a8c2f846be2..eed0a2eba6cb 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -438,6 +438,19 @@ struct vdev { int64_t vdev_outlier_count; /* read outlier amongst peers */ hrtime_t vdev_read_sit_out_expire; /* end of sit out period */ list_node_t vdev_leaf_node; /* leaf vdev list */ + /* + * vdev_large_label has different meanings for leaf and non-leaf vdevs. + * For leaf vdevs, it is true if that specific vdev is using the large + * label format. For non-leaf vdevs, it is true if any of its children + * is using the new format, so that we know if we need to invoke the + * large label sync logic. 
+ */ + boolean_t vdev_large_label; + + kmutex_t vdev_be_lock; + kcondvar_t vdev_be_cv; + abd_t *vdev_next_bootenv; + size_t vdev_bootenv_size; /* * For DTrace to work in userland (libzpool) context, these fields must @@ -479,6 +492,7 @@ struct vdev { #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +#define VDEV_LARGE_UBERBLOCK_RING (128 << 20) // The last 128MiB /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock @@ -487,14 +501,22 @@ struct vdev { #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ -#define MAX_UBERBLOCK_SHIFT (13) +#define MAX_UBERBLOCK_SHIFT(new) ((new) ? 24 : 13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ - MAX_UBERBLOCK_SHIFT) -#define VDEV_UBERBLOCK_COUNT(vd) \ + MAX_UBERBLOCK_SHIFT((vd)->vdev_large_label)) +#define VDEV_UBERBLOCK_COUNT_OLD(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) -#define VDEV_UBERBLOCK_OFFSET(vd, n) \ +#define VDEV_UBERBLOCK_COUNT(vd) \ + (((vd)->vdev_large_label ? VDEV_LARGE_LABEL_SIZE - \ + VDEV_LARGE_UBERBLOCK_RING : \ + VDEV_UBERBLOCK_RING) >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET_OLD(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + ((vd)->vdev_large_label ? (VDEV_LARGE_UBERBLOCK_RING + \ + ((n) << VDEV_UBERBLOCK_SHIFT(vd))) : \ + VDEV_UBERBLOCK_OFFSET_OLD(vd, n)) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { @@ -538,6 +560,93 @@ typedef struct vdev_label { char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ +/* + * The large label format was introduced to help future-proof ZFS as sector + * sizes grow. 
The number of uberblocks that can be safely written is limited + * by the size of the ring divided by the sector size, which in the original + * format was already getting uncomfortably small. The new label is only used + * on top-level vdevs and their children; l2arc and spare devices are excluded. + * + * Layout of the large label format: + * + * 16 MiB 112 MiB 128 MiB + * +---------+--------------------------------+-------------------------------+ + * | padding | Data (boot info, configs, | Uberblock ring | + * | | aux uberblocks) | | + * +---------+--------------------------------+-------------------------------+ + * + * The first thing in the Data section is a 32KiB sub-section for the Table + * of Contents (ToC). The ToC is an nvlist of nvlists; each nvlist pertains to + * a different sub-section in the Data region (boot info, vdev config, pool + * config, etc). The sub-nvlist contains relevant information, mostly offset + * and size. + * + * Currently, each sub-section is protected with an embedded checksum. In the + * event that a sub-section is larger than 16MiB, it will be split in + * 16MiB - sizeof (zio_eck_t) chunks, which will each have their own checksum. + * Future sub-sections may have their own checksum mechanisms (or none at all). + */ +#define VDEV_LARGE_PAD_SIZE (1 << 24) // 16MiB +#define VDEV_LARGE_DATA_SIZE ((1 << 27) - VDEV_LARGE_PAD_SIZE) +#define VDEV_LARGE_LABEL_SIZE (VDEV_LARGE_PAD_SIZE + VDEV_LARGE_DATA_SIZE + \ + VDEV_LARGE_UBERBLOCK_RING) // 256MiB per label +#define VDEV_LARGE_LABEL_ALIGN (1 << 24) // 16MiB + +#define VDEV_RESERVE_OFFSET (VDEV_LARGE_LABEL_SIZE * 2) +#define VDEV_RESERVE_SIZE (1 << 29) // 512MiB + +#define VDEV_TOC_SIZE (1 << 15) + +/* + * Each section in the label has its entry in the "sections" nvlist. This can + * store any necessary data, but will usually contain at least these two + * fields, representing the size and offset of the section. 
+ */ +#define VDEV_SECTION_SIZE "section_size" +#define VDEV_SECTION_OFFSET "section_offset" + +/* + * While the data part of the TOC is always VDEV_TOC_SIZE, the actual write + * gets rounded up to the ashift. We don't know the ashift yet early in import, + * when we need to read this info. + */ +#define VDEV_TOC_TOC_SIZE "toc_size" +#define VDEV_TOC_SECTIONS "sections" + +/* The section that stores the boot region */ +#define VDEV_TOC_BOOT_REGION "boot_region" +/* The section that stores the vdev config */ +#define VDEV_TOC_VDEV_CONFIG "vdev_config" +/* The section that stores the pool config */ +#define VDEV_TOC_POOL_CONFIG "pool_config" + +static inline boolean_t +vdev_toc_get_secinfo(nvlist_t *toc, const char *section, uint32_t *size, + uint32_t *offset) +{ + nvlist_t *sections, *secinfo; + if (nvlist_lookup_nvlist(toc, VDEV_TOC_SECTIONS, &sections) != 0) + return (B_FALSE); + if (nvlist_lookup_nvlist(sections, section, &secinfo) != 0) + return (B_FALSE); + if (nvlist_lookup_uint32(secinfo, VDEV_SECTION_SIZE, size) != 0) + return (B_FALSE); + if (nvlist_lookup_uint32(secinfo, VDEV_SECTION_OFFSET, offset) != 0) + return (B_FALSE); + return (B_TRUE); +} + +static inline void +vdev_toc_add_secinfo(nvlist_t *sections, const char *section, uint32_t size, + uint32_t offset) +{ + nvlist_t *secinfo = fnvlist_alloc(); + fnvlist_add_uint32(secinfo, VDEV_SECTION_SIZE, size); + fnvlist_add_uint32(secinfo, VDEV_SECTION_OFFSET, offset); + fnvlist_add_nvlist(sections, section, secinfo); + fnvlist_free(secinfo); +} + /* * vdev_dirty() flags */ @@ -549,20 +658,31 @@ typedef struct vdev_label { /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. - * On RAIDZ, this space is overwritten during RAIDZ expansion. + * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device.
*/ -#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) -#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_OLD_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + \ + VDEV_BOOT_SIZE) +#define VDEV_OLD_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) + +#define VDEV_LARGE_LABEL_START_SIZE (VDEV_RESERVE_OFFSET + \ + VDEV_RESERVE_SIZE) +#define VDEV_LARGE_LABEL_END_SIZE (2 * VDEV_LARGE_LABEL_SIZE) + +#define VDEV_LABEL_START_SIZE(vd) ((vd)->vdev_large_label ? \ + VDEV_LARGE_LABEL_START_SIZE : VDEV_OLD_LABEL_START_SIZE) +#define VDEV_LABEL_END_SIZE(vd) ((vd)->vdev_large_label ? \ + VDEV_LARGE_LABEL_END_SIZE : VDEV_OLD_LABEL_END_SIZE) + #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS -#define VDEV_OFFSET_IS_LABEL(vd, off) \ - (((off) < VDEV_LABEL_START_SIZE) || \ - ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) +#define VDEV_OFFSET_IS_LABEL(vd, off) \ + (((off) < VDEV_LABEL_START_SIZE(vd)) || \ + ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE(vd)))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..a5024e5d9a64 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -90,6 +90,7 @@ typedef enum spa_feature { SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURE_PHYSICAL_REWRITE, + SPA_FEATURE_LARGE_LABEL, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 184ea4a55b43..6c9f7a0b82e3 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -639,7 +639,7 @@ - + @@ -6402,7 +6402,8 @@ - + + @@ -9617,8 +9618,8 @@ - - + + @@ -9696,7 +9697,7 @@ - + diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 7f276e9592c9..a9d379579bae 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -208,7 +208,7 @@ zpool_clear_label(int fd) "label < l2arc_dev_hdr_phys_t"); memset(label, 0, sizeof (l2arc_dev_hdr_phys_t)); if 
(pwrite64(fd, label, sizeof (l2arc_dev_hdr_phys_t), - VDEV_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) + VDEV_OLD_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) header_cleared = B_TRUE; } diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index bbd1dafc69be..e3b85cbb9d62 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -465,7 +465,8 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; vdev_lists[l] = kmem_alloc(size, KM_SLEEP); - offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + offsets[l] = vdev_label_offset(psize, l, 0, B_FALSE) + + VDEV_SKIP_SIZE; sizes[l] = size; errors[l] = 0; ASSERT0(offsets[l] % pp->sectorsize); diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c index 11e93b800a54..a080a6abc499 100644 --- a/module/os/freebsd/zfs/vdev_label_os.c +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -61,7 +61,7 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) retry: zio = zio_root(spa, NULL, NULL, flags); - vdev_label_write(zio, vd, 0, pad2, + vdev_label_write(zio, vd, 0, B_FALSE, pad2, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); error = zio_wait(zio); @@ -107,8 +107,9 @@ vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) * offset lets us access the boot area. 
*/ zio_nowait(zio_vdev_child_io(pio, NULL, childvd, - VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abd, size, ZIO_TYPE_READ, - ZIO_PRIORITY_ASYNC_READ, 0, vdev_child_done, pio)); + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE(childvd), + abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 0, + vdev_child_done, pio)); zio_wait(pio); unsigned char *buf = abd_to_buf(abd); diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..b3668bc0f573 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -810,6 +810,18 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures); } + { + static const spa_feature_t large_label_deps[] = { + SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_LABEL, + "com.klarasystems:large_label", "large_label", + "Support for larger label format.", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, large_label_deps, + sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index bd6dc8edd8ca..d4fb79dc3c4a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6333,9 +6333,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd = hdr_abd; } - ASSERT(addr >= VDEV_LABEL_START_SIZE && + ASSERT(addr >= VDEV_LABEL_START_SIZE(vd) && addr + asize <= vd->vdev_psize - - VDEV_LABEL_END_SIZE); + VDEV_LABEL_END_SIZE(vd)); /* * l2arc read. The SCL_L2ARC lock will be @@ -9170,8 +9170,8 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * l2ad_evict. 
*/ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); - vdev_trim_simple(vd, - dev->l2ad_evict - VDEV_LABEL_START_SIZE, + vdev_trim_simple(vd, dev->l2ad_evict - + VDEV_LABEL_START_SIZE(vd), taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); @@ -9977,8 +9977,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); - adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; - adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + adddev->l2ad_start = VDEV_LABEL_START_SIZE(vd) + l2dhdr_asize; + adddev->l2ad_end = VDEV_LABEL_START_SIZE(vd) + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; @@ -10536,15 +10536,18 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; + vdev_t *vd = dev->l2ad_vdev; + uint64_t offset = vd ? VDEV_LABEL_START_SIZE(vd) : + VDEV_OLD_LABEL_START_SIZE; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); - err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, - VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, - ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, offset, + l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10897,6 +10900,9 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; + vdev_t *vd = dev->l2ad_vdev; + uint64_t offset = vd ? 
VDEV_LABEL_START_SIZE(vd) : + VDEV_OLD_LABEL_START_SIZE; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); @@ -10919,8 +10925,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, - VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, - NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + offset, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 7db72b9b04b0..4487d2f00952 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -530,7 +530,7 @@ mmp_write_uberblock(spa_t *spa) MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL)); label = random_in_range(VDEV_LABELS); - vdev_label_write(zio, vd, label, ub_abd, offset, + vdev_label_write(zio, vd, label, vd->vdev_large_label, ub_abd, offset, VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, flags | ZIO_FLAG_DONT_PROPAGATE); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b3bb46da263b..b9d157f48ff4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -973,7 +973,7 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) } /* Save time if the version is already set. 
*/ - if (ver == spa_version(spa)) + if (spa_version(spa) >= ver) continue; /* @@ -6555,6 +6555,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; + boolean_t has_large_label; spa_feature_t feat; const char *feat_name; const char *poolname; @@ -6601,6 +6602,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; + has_large_label = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { @@ -6612,6 +6614,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; + if (feat == SPA_FEATURE_LARGE_LABEL) + has_large_label = B_TRUE; } } @@ -6647,6 +6651,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; + spa->spa_create_large_label_ok = has_large_label; /* * Create "The Godfather" zio to hold all async IOs @@ -9811,7 +9816,7 @@ spa_sync_version(void *arg, dmu_tx_t *tx) ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - ASSERT(version >= spa_version(spa)); + ASSERT3U(version, >=, spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); @@ -10079,6 +10084,13 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) spa->spa_cksum_salt.zcs_bytes, tx)); } + if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES && + spa->spa_root_vdev->vdev_large_label && + spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_LABEL) && + !spa_feature_is_active(spa, SPA_FEATURE_LARGE_LABEL)) { + spa_feature_incr(spa, SPA_FEATURE_LARGE_LABEL, tx); + } + rrw_exit(&dp->dp_config_rwlock, FTAG); } 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 654e034de9e1..e31d97067ab3 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -100,6 +100,8 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; +uint64_t zfs_vdev_large_label_min_size = 1ULL << 40; + static int vdev_validate_skip = B_FALSE; /* @@ -579,12 +581,15 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent); + pvd->vdev_large_label |= cvd->vdev_large_label; /* * Walk up all ancestors to update guid sum. */ - for (; pvd != NULL; pvd = pvd->vdev_parent) + for (; pvd != NULL; pvd = pvd->vdev_parent) { + pvd->vdev_large_label |= cvd->vdev_large_label; pvd->vdev_guid_sum += cvd->vdev_guid_sum; + } if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); @@ -769,6 +774,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_be_lock, NULL, MUTEX_NOLOCKDEP, NULL); + cv_init(&vd->vdev_be_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, @@ -1090,6 +1097,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_autosit = vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + if (vd->vdev_ops->vdev_op_leaf) { + (void) nvlist_lookup_boolean_value(nv, ZPOOL_CONFIG_LARGE_LABEL, + &vd->vdev_large_label); + } + /* * Add ourselves to the parent's list of children. 
*/ @@ -1244,6 +1256,9 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); + mutex_destroy(&vd->vdev_be_lock); + cv_destroy(&vd->vdev_be_cv); + zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); @@ -1922,10 +1937,18 @@ vdev_probe(vdev_t *vd, zio_t *zio) } for (int l = 1; l < VDEV_LABELS; l++) { + size_t size; + offset_t offset; + if (vd->vdev_large_label) { + size = P2ROUNDUP(VDEV_TOC_SIZE, 1 << vd->vdev_ashift); + offset = VDEV_LARGE_PAD_SIZE; + } else { + size = VDEV_PAD_SIZE; + offset = offsetof(vdev_label_t, vl_be); + } zio_nowait(zio_read_phys(pio, vd, - vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, - abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), + vdev_label_offset(vd->vdev_psize, l, offset, + vd->vdev_large_label), size, abd_alloc_for_io(size, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } @@ -2235,22 +2258,38 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); - max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); - + boolean_t large_label; + if (spa_version(spa) < SPA_VERSION_FEATURES || + !(spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_LABEL) || + spa->spa_create_large_label_ok)) + large_label = B_FALSE; + else if (vd->vdev_asize == 0 && vd->vdev_ops->vdev_op_leaf) + large_label = osize > zfs_vdev_large_label_min_size; + else + large_label = vd->vdev_large_label; + uint64_t align; + if (large_label) + align = VDEV_LARGE_LABEL_ALIGN; + else + align = sizeof (vdev_label_t); + osize = P2ALIGN_TYPED(osize, align, uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, align, uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } + uint64_t ssize = large_label ? 
VDEV_LARGE_LABEL_START_SIZE : + VDEV_OLD_LABEL_START_SIZE; + uint64_t esize = large_label ? VDEV_LARGE_LABEL_END_SIZE : + VDEV_OLD_LABEL_END_SIZE; psize = osize; - asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); - max_asize = max_osize - (VDEV_LABEL_START_SIZE + - VDEV_LABEL_END_SIZE); + asize = osize - (ssize + esize); + max_asize = max_osize - (ssize + esize); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + (VDEV_OLD_LABEL_START_SIZE + VDEV_OLD_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); @@ -2298,6 +2337,10 @@ vdev_open(vdev_t *vd) vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; + if (vd->vdev_ops->vdev_op_leaf && large_label) { + vd->vdev_large_label = B_TRUE; + } + /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, @@ -2386,6 +2429,13 @@ vdev_open(vdev_t *vd) if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); + if (!vd->vdev_ops->vdev_op_leaf) { + for (int c = 0; c < vd->vdev_children; c++) { + vd->vdev_large_label |= + vd->vdev_child[c]->vdev_large_label; + } + } + return (0); } @@ -4904,8 +4954,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; - vs->vs_rsize += VDEV_LABEL_START_SIZE + - VDEV_LABEL_END_SIZE; + vs->vs_rsize += VDEV_LABEL_START_SIZE(vd) + + VDEV_LABEL_END_SIZE(vd); /* * Report initializing progress. 
Since we don't * have the initializing locks held, this is only @@ -6783,6 +6833,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, large_label_min_size, U64, ZMOD_RW, + "Minimum size for a disk to use the new large label format"); + ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3f7d..5d84930d1753 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -2508,8 +2508,9 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, vdev_draid_calculate_asize(tvd, &asize, &max_asize, logical_ashift, physical_ashift); - *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *psize = asize + VDEV_LABEL_START_SIZE(vd) + VDEV_LABEL_END_SIZE(vd); + *max_psize = max_asize + VDEV_LABEL_START_SIZE(vd) + + VDEV_LABEL_END_SIZE(vd); vds->vds_draid_vdev = tvd; vd->vdev_nonrot = tvd->vdev_nonrot; @@ -2611,7 +2612,7 @@ vdev_draid_spare_io_start(zio_t *zio) { vdev_t *cvd = NULL, *vd = zio->io_vd; vdev_draid_spare_t *vds = vd->vdev_tsd; - uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE(vd); /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 7538f471e63c..10af2586831a 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -952,7 +952,7 @@ vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + - VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + VDEV_LABEL_START_SIZE(vd) + VDEV_LABEL_END_SIZE(vd); *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 27188c46e561..926fa3262eea 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -348,7 +348,8 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + zfs_rs_get_start(rs, rt) + + VDEV_LABEL_START_SIZE(vd) + + zfs_rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 0d4fdaa77ba0..5a8a2201b58d 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -160,34 +160,43 @@ * Used throughout the rest of this file. */ uint64_t -vdev_label_offset(uint64_t psize, int l, uint64_t offset) +vdev_label_offset(uint64_t psize, int l, uint64_t offset, boolean_t large_label) { - ASSERT(offset < sizeof (vdev_label_t)); - ASSERT0(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t)); + uint64_t lsize = large_label ? VDEV_LARGE_LABEL_SIZE : + sizeof (vdev_label_t); + uint64_t align = large_label ? VDEV_LARGE_LABEL_ALIGN : + sizeof (vdev_label_t); - return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); + ASSERT(offset < lsize); + ASSERT0(P2PHASE_TYPED(psize, align, uint64_t)); + + return (offset + l * lsize + (l < VDEV_LABELS / 2 ? 
+ 0 : psize - VDEV_LABELS * lsize)); } /* * Returns back the vdev label associated with the passed in offset. */ int -vdev_label_number(uint64_t psize, uint64_t offset) +vdev_label_number(uint64_t psize, uint64_t offset, boolean_t new) { int l; + uint64_t lsize = new ? VDEV_LARGE_LABEL_SIZE : sizeof (vdev_label_t); + uint64_t esize = new ? VDEV_LARGE_LABEL_END_SIZE : + VDEV_OLD_LABEL_END_SIZE; - if (offset >= psize - VDEV_LABEL_END_SIZE) { - offset -= psize - VDEV_LABEL_END_SIZE; - offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); + if (offset >= psize - esize) { + offset -= psize - esize; + offset += (VDEV_LABELS / 2) * lsize; } - l = offset / sizeof (vdev_label_t); + l = offset / lsize; return (l < VDEV_LABELS ? l : -1); } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) +vdev_label_read(zio_t *zio, vdev_t *vd, int l, boolean_t large_label, + abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, + void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || @@ -195,14 +204,15 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), + vdev_label_offset(vd->vdev_psize, l, offset, large_label), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) +vdev_label_write(zio_t *zio, vdev_t *vd, int l, boolean_t large_label, + abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, + void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || @@ -210,7 +220,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, 
ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), + vdev_label_offset(vd->vdev_psize, l, offset, large_label), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } @@ -502,6 +512,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (flags & VDEV_CONFIG_L2CACHE) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (vd->vdev_ops->vdev_op_leaf) + fnvlist_add_boolean_value(nv, ZPOOL_CONFIG_LARGE_LABEL, + vd->vdev_large_label); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -796,6 +810,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) vdev_phys_t *vp[VDEV_LABELS]; abd_t *vp_abd[VDEV_LABELS]; zio_t *zio[VDEV_LABELS]; + uint64_t vp_off[VDEV_LABELS]; + uint32_t vp_size[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; @@ -817,23 +833,56 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) return (vdev_draid_read_config_spare(vd)); for (int l = 0; l < VDEV_LABELS; l++) { - vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp_abd[l] = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); vp[l] = abd_to_buf(vp_abd[l]); + vp_off[l] = 0; + vp_size[l] = 0; } retry: + if (vd->vdev_large_label) { + size_t toc_size = P2ROUNDUP(VDEV_TOC_SIZE, + 1 << vd->vdev_ashift); + for (int l = 0; l < VDEV_LABELS; l++) { + zio[l] = zio_root(spa, NULL, NULL, flags); + vdev_label_read(zio[l], vd, l, vd->vdev_large_label, + vp_abd[l], VDEV_LARGE_PAD_SIZE, + toc_size, NULL, NULL, flags); + } + for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *toc = NULL; + + if (zio_wait(zio[l]) == 0 && nvlist_unpack( + (char *)vp[l], VDEV_TOC_SIZE, &toc, 0) == 0) { + uint32_t off; + if (!vdev_toc_get_secinfo(toc, + VDEV_TOC_VDEV_CONFIG, + &vp_size[l], &off)) + continue; + vp_off[l] = VDEV_LARGE_PAD_SIZE + off; + fnvlist_free(toc); + } + } + } else { 
+ for (int l = 0; l < VDEV_LABELS; l++) { + vp_off[l] = offsetof(vdev_label_t, vl_vdev_phys); + vp_size[l] = sizeof (vdev_phys_t); + } + } for (int l = 0; l < VDEV_LABELS; l++) { + if (vp_off[l] == 0) + continue; zio[l] = zio_root(spa, NULL, NULL, flags); - - vdev_label_read(zio[l], vd, l, vp_abd[l], - offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - NULL, NULL, flags); + vdev_label_read(zio[l], vd, l, vd->vdev_large_label, vp_abd[l], + vp_off[l], vp_size[l], NULL, NULL, flags); } for (int l = 0; l < VDEV_LABELS; l++) { + if (vp_off[l] == 0) + continue; nvlist_t *label = NULL; if (zio_wait(zio[l]) == 0 && - nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), + nvlist_unpack(vp[l]->vp_nvlist, vp_size[l], &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their @@ -1053,6 +1102,10 @@ vdev_aux_label_generate(vdev_t *vd, boolean_t reason_spare) fnvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); } + if (vd->vdev_large_label) { + fnvlist_add_boolean_value(label, ZPOOL_CONFIG_LARGE_LABEL, + vd->vdev_large_label); + } return (label); } @@ -1074,6 +1127,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; + abd_t *sc_abd = NULL; + abd_t *toc_abd = NULL; + abd_t *ub_abd2 = NULL; zio_t *zio; char *buf; size_t buflen; @@ -1159,8 +1215,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. 
*/ - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - abd_zero(vp_abd, sizeof (vdev_phys_t)); + size_t vp_size = P2ROUNDUP(sizeof (vdev_phys_t), 1 << vd->vdev_ashift); + vp_abd = abd_alloc_linear(vp_size, B_TRUE); + abd_zero(vp_abd, vp_size); vp = abd_to_buf(vp_abd); /* @@ -1228,7 +1285,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) for (int l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp_abd, + vdev_label_write(zio, vd, l, B_FALSE, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -1237,15 +1294,80 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, bootenv, + vdev_label_write(zio, vd, l, B_FALSE, bootenv, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub_abd, + vdev_label_write(zio, vd, l, B_FALSE, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } + if (vd->vdev_large_label) { + nvlist_t *toc = fnvlist_alloc(); + size_t toc_buflen = VDEV_TOC_SIZE; + size_t writesize = P2ROUNDUP(toc_buflen, 1 << vd->vdev_ashift); + nvlist_t *spa_config = spa_config_generate(vd->vdev_spa, + spa->spa_root_vdev, crtxg, 0); + size_t sc_buflen = 0; + VERIFY0(nvlist_size(spa_config, &sc_buflen, NV_ENCODE_XDR)); + sc_buflen = P2ROUNDUP(sc_buflen + sizeof (zio_eck_t), + 1 << vd->vdev_ashift); + if (sc_abd == NULL) + sc_abd = abd_alloc_linear(sc_buflen, B_TRUE); + char *sc_buf = abd_to_buf(sc_abd); + VERIFY0(nvlist_pack(spa_config, &sc_buf, &sc_buflen, + NV_ENCODE_XDR, KM_SLEEP)); + fnvlist_free(spa_config); + fnvlist_add_uint32(toc, VDEV_TOC_TOC_SIZE, writesize); + + nvlist_t *sections = fnvlist_alloc(); + vdev_toc_add_secinfo(sections, VDEV_TOC_VDEV_CONFIG, vp_size, + writesize); + vdev_toc_add_secinfo(sections, VDEV_TOC_POOL_CONFIG, sc_buflen, + 
writesize + vp_size); + fnvlist_add_nvlist(toc, VDEV_TOC_SECTIONS, sections); + fnvlist_free(sections); + + if (ub_abd2 == NULL) + ub_abd2 = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + abd_zero(ub_abd2, SPA_MAXBLOCKSIZE); + + + toc_abd = abd_alloc_linear(toc_buflen, B_TRUE); + char *buf = abd_to_buf(toc_abd); + VERIFY0(nvlist_pack(toc, &buf, &toc_buflen, NV_ENCODE_XDR, + KM_SLEEP)); + fnvlist_free(toc); + for (int l = 0; l < VDEV_LABELS; l++) { + uint64_t offset = VDEV_LARGE_PAD_SIZE; + vdev_label_write(zio, vd, l, B_TRUE, toc_abd, offset, + toc_buflen, NULL, NULL, flags); + offset += writesize; + vdev_label_write(zio, vd, l, B_TRUE, vp_abd, offset, + vp_size, NULL, NULL, flags); + offset += vp_size; + vdev_label_write(zio, vd, l, B_TRUE, sc_abd, offset, + sc_buflen, NULL, NULL, flags); + + abd_copy_from_buf(ub_abd2, &spa->spa_uberblock, + sizeof (uberblock_t)); + + for (int u = 0; + u < VDEV_LARGE_UBERBLOCK_RING / SPA_MAXBLOCKSIZE; + u++) { + vdev_label_write(zio, vd, l, B_TRUE, ub_abd2, + VDEV_LARGE_UBERBLOCK_RING + + u * SPA_MAXBLOCKSIZE, SPA_MAXBLOCKSIZE, + NULL, NULL, flags); + if (u == 0) { + abd_zero_off(ub_abd2, 0, + sizeof (uberblock_t)); + } + } + } + } + error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { @@ -1256,7 +1378,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) nvlist_free(label); abd_free(bootenv); abd_free(ub_abd); + abd_free(ub_abd2); abd_free(vp_abd); + abd_free(toc_abd); + abd_free(sc_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -1310,15 +1435,53 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) for (int c = 0; c < vd->vdev_children; c++) vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + if (!(vd->vdev_ops->vdev_op_leaf && vdev_readable(vd))) + return; /* * We just use the first label that has a correct checksum; the * bootloader should have rewritten them all to be the same on boot, * and any changes we made since boot have 
been the same across all * labels. */ - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_large_label) { + zio_t *rios[VDEV_LABELS]; + size_t toc_size = P2ROUNDUP(VDEV_TOC_SIZE, + 1 << vd->vdev_ashift); + abd_t *toc_abds[VDEV_LABELS]; + for (int l = 0; l < VDEV_LABELS; l++) { + rios[l] = zio_root(zio->io_spa, NULL, NULL, flags); + toc_abds[l] = abd_alloc_linear(toc_size, B_FALSE); + vdev_label_read(rios[l], vd, l, B_TRUE, toc_abds[l], + VDEV_LARGE_PAD_SIZE, toc_size, NULL, NULL, flags); + } for (int l = 0; l < VDEV_LABELS; l++) { - vdev_label_read(zio, vd, l, + int err = zio_wait(rios[l]); + if (err != 0) { + abd_free(toc_abds[l]); + continue; + } + nvlist_t *toc; + if (nvlist_unpack(abd_to_buf(toc_abds[l]), toc_size, + &toc, KM_SLEEP) != 0) { + abd_free(toc_abds[l]); + continue; + } + abd_free(toc_abds[l]); + uint32_t bootenv_size, bootenv_offset; + if (!vdev_toc_get_secinfo(toc, VDEV_TOC_BOOT_REGION, + &bootenv_size, &bootenv_offset)) { + fnvlist_free(toc); + continue; + } + fnvlist_free(toc); /* size/offset copied out */ + vdev_label_read(zio, vd, l, B_TRUE, + abd_alloc_linear(bootenv_size, B_FALSE), + VDEV_LARGE_PAD_SIZE + bootenv_offset, bootenv_size, + vdev_label_read_bootenv_done, zio, flags); + } + } else { + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_read(zio, vd, l, B_FALSE, abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, vdev_label_read_bootenv_done, zio, flags); @@ -1431,13 +1594,30 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) !vdev_writeable(vd)) { return (error); } - ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); - abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(abd, VDEV_PAD_SIZE); + size_t content_size = 0; + if (vd->vdev_large_label) { + switch (fnvlist_lookup_uint64(env, BOOTENV_VERSION)) { + case VB_RAW: + if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) + content_size = strlen(tmp); + break; + case VB_NVLIST: + VERIFY0(nvlist_size(env, &content_size, NV_ENCODE_XDR)); + break; + default: + return
(EINVAL); + } + } + + uint32_t new_abd_size = (vd->vdev_large_label ? + P2ROUNDUP(content_size + sizeof (uint64_t) + sizeof (zio_eck_t), + 1 << vd->vdev_ashift) : VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(new_abd_size, B_TRUE); + abd_zero(abd, new_abd_size); - bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + bootenv = abd_borrow_buf_copy(abd, new_abd_size); nvbuf = bootenv->vbe_bootenv; - nvsize = sizeof (bootenv->vbe_bootenv); + nvsize = new_abd_size - sizeof (zio_eck_t) - sizeof (uint64_t); bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { @@ -1460,7 +1640,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) if (error == 0) { bootenv->vbe_version = htonll(bootenv->vbe_version); - abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + abd_return_buf_copy(abd, bootenv, new_abd_size); } else { abd_free(abd); return (SET_ERROR(error)); @@ -1468,13 +1648,75 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) retry: zio = zio_root(spa, NULL, NULL, flags); - for (int l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, abd, - offsetof(vdev_label_t, vl_be), - VDEV_PAD_SIZE, NULL, NULL, flags); + int err; + if (vd->vdev_large_label) { + zio_t *rios[VDEV_LABELS]; + size_t toc_size = P2ROUNDUP(VDEV_TOC_SIZE, + 1 << vd->vdev_ashift); + abd_t *toc_abds[VDEV_LABELS]; + for (int l = 0; l < VDEV_LABELS; l++) { + rios[l] = zio_root(spa, NULL, NULL, + flags | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD); + toc_abds[l] = abd_alloc_linear(toc_size, B_FALSE); + vdev_label_read(rios[l], vd, l, B_TRUE, toc_abds[l], + VDEV_LARGE_PAD_SIZE, toc_size, NULL, NULL, + flags | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD); + } + boolean_t all_writeable = B_TRUE; + for (int l = 0; l < VDEV_LABELS; l++) { + error = zio_wait(rios[l]); + if (error != 0) { + all_writeable = B_FALSE; + continue; + } + nvlist_t *toc; + uint32_t bootenv_size, bootenv_offset; + if ((error = nvlist_unpack(abd_to_buf(toc_abds[l]), + toc_size, &toc,
KM_SLEEP))) { + all_writeable = B_FALSE; + continue; + } + if (!vdev_toc_get_secinfo(toc, VDEV_TOC_BOOT_REGION, + &bootenv_size, &bootenv_offset)) { + fnvlist_free(toc); + all_writeable = B_FALSE; + continue; + } + fnvlist_free(toc); + + if (new_abd_size == bootenv_size) { + vdev_label_write(zio, vd, l, B_TRUE, abd, + VDEV_LARGE_PAD_SIZE + bootenv_offset, + new_abd_size, NULL, NULL, flags); + } else { + all_writeable = B_FALSE; + } + } + for (int l = 0; l < VDEV_LABELS; l++) + abd_free(toc_abds[l]); + err = zio_wait(zio); + if (!all_writeable) { + mutex_enter(&vd->vdev_be_lock); + vd->vdev_next_bootenv = abd; + vdev_config_dirty(vd); + txg_kick(spa_get_dsl(spa), + spa_syncing_txg(spa) + TXG_SIZE); + (void) spa_vdev_state_exit(spa, NULL, 0); + cv_wait(&vd->vdev_be_cv, &vd->vdev_be_lock); + mutex_exit(&vd->vdev_be_lock); + spa_vdev_state_enter(spa, SCL_ALL); + } + } else { + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_write(zio, vd, l, B_FALSE, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + } + err = zio_wait(zio); } - error = zio_wait(zio); + if (error == 0) + error = err; if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; @@ -1575,21 +1817,30 @@ vdev_uberblock_load_done(zio_t *zio) } static void -vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, - struct ubl_cbdata *cbp) +vdev_uberblock_load_impl(zio_t **zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp, uint32_t *ios) { for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp, + ios); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, + (*ios)++; + if (*ios > 1 << 16) { + (void) zio_wait(*zio); + *zio = zio_root(vd->vdev_spa, NULL, cbp, + flags); + 
*ios = 1; + } + vdev_label_read(*zio, vd, l, + vd->vdev_large_label, abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, zio, flags); + vdev_uberblock_load_done, *zio, flags); } } } @@ -1621,7 +1872,8 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); - vdev_uberblock_load_impl(zio, rvd, flags, &cb); + uint32_t ios = 0; + vdev_uberblock_load_impl(&zio, rvd, flags, &cb, &ios); (void) zio_wait(zio); /* @@ -1682,6 +1934,7 @@ vdev_copy_uberblocks(vdev_t *vd) int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + boolean_t new = vd->vdev_large_label; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); @@ -1704,7 +1957,7 @@ vdev_copy_uberblocks(vdev_t *vd) zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); - vdev_label_read(zio, vd, src_label, ub_abd, + vdev_label_read(zio, vd, src_label, new, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); @@ -1712,7 +1965,7 @@ vdev_copy_uberblocks(vdev_t *vd) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) - vdev_label_write(write_zio, vd, l, ub_abd, + vdev_label_write(write_zio, vd, l, new, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); @@ -1795,11 +2048,21 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, abd_zero_off(ub_abd, sizeof (uberblock_t), VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); - for (int l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub_abd, + for (int l = 0; l < VDEV_LABELS; l++) { + if (vd->vdev_large_label && l < VDEV_LABELS / 2) { + int old_n = (ub->ub_txg - + (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT_OLD(vd) - m); + vdev_label_write(zio, vd, l, B_FALSE, 
ub_abd, + VDEV_UBERBLOCK_OFFSET_OLD(vd, old_n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + vdev_label_write(zio, vd, l, vd->vdev_large_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); + } abd_free(ub_abd); } @@ -1895,12 +2158,88 @@ vdev_label_sync_ignore_done(zio_t *zio) kmem_free(zio->io_private, sizeof (uint64_t)); } +static void +vdev_label_sync_large(vdev_t *vd, zio_t *zio, uint64_t *good_writes, + int l, int flags, abd_t *sc_abd, abd_t *vp_abd) +{ + ASSERT(vd->vdev_ops->vdev_op_leaf); + if (!vd->vdev_large_label) + return; + + ASSERT(vp_abd); + ASSERT(sc_abd); + size_t bootenv_size = 0; + abd_t *bootenv_abd = NULL; + if (vd->vdev_next_bootenv) { + bootenv_abd = vd->vdev_next_bootenv; + ASSERT(bootenv_abd); + bootenv_size = abd_get_size(bootenv_abd); + } else { + bootenv_size = vd->vdev_bootenv_size; + } + uint_t vdev_config_size = abd_get_size(vp_abd); + uint_t pool_config_size = abd_get_size(sc_abd); + size_t toc_buflen = VDEV_TOC_SIZE; + size_t writesize = P2ROUNDUP(toc_buflen, 1 << vd->vdev_ashift); + nvlist_t *toc = fnvlist_alloc(); + fnvlist_add_uint32(toc, VDEV_TOC_TOC_SIZE, writesize); + + nvlist_t *sections = fnvlist_alloc(); + if (bootenv_size != 0) { + vdev_toc_add_secinfo(sections, VDEV_TOC_BOOT_REGION, + bootenv_size, writesize); + } + vdev_toc_add_secinfo(sections, VDEV_TOC_VDEV_CONFIG, vdev_config_size, + writesize + bootenv_size); + vdev_toc_add_secinfo(sections, VDEV_TOC_POOL_CONFIG, pool_config_size, + writesize + bootenv_size + vdev_config_size); + fnvlist_add_nvlist(toc, VDEV_TOC_SECTIONS, sections); + fnvlist_free(sections); + + ASSERT3U(fnvlist_size(toc) + sizeof (zio_eck_t), <=, VDEV_TOC_SIZE); + + abd_t *toc_abd; + char *buf; + toc_abd = abd_alloc_linear(toc_buflen, B_TRUE); + buf = abd_to_buf(toc_abd); + + if (nvlist_pack(toc, &buf, &toc_buflen, NV_ENCODE_XDR, KM_SLEEP)) { + abd_free(toc_abd); + 
fnvlist_free(toc); + return; + } + fnvlist_free(toc); /* already packed into toc_abd */ + for (; l < VDEV_LABELS; l += 2) { + uint64_t offset = VDEV_LARGE_PAD_SIZE; + vdev_label_write(zio, vd, l, B_TRUE, toc_abd, offset, + toc_buflen, vdev_label_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + offset += writesize; + if (bootenv_abd) { + vdev_label_write(zio, vd, l, B_TRUE, bootenv_abd, + offset, bootenv_size, vdev_label_sync_done, + good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); + vd->vdev_bootenv_size = bootenv_size; + } + offset += bootenv_size; + vdev_label_write(zio, vd, l, B_TRUE, vp_abd, offset, + vdev_config_size, vdev_label_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + offset += vdev_config_size; + vdev_label_write(zio, vd, l, B_TRUE, sc_abd, offset, + pool_config_size, vdev_label_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + + abd_free(toc_abd); +} + /* * Write all even or odd labels to all leaves of the specified vdev. */ static void vdev_label_sync(zio_t *zio, uint64_t *good_writes, - vdev_t *vd, int l, uint64_t txg, int flags) + vdev_t *vd, int l, uint64_t txg, int flags, abd_t *sc_abd) { nvlist_t *label; vdev_phys_t *vp; @@ -1912,7 +2251,7 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, for (int c = 0; c < vd->vdev_children; c++) { vdev_label_sync(zio, good_writes, - vd->vdev_child[c], l, txg, flags); + vd->vdev_child[c], l, txg, flags, sc_abd); } if (!vd->vdev_ops->vdev_op_leaf) @@ -1949,8 +2288,11 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { - for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp_abd, + vdev_label_sync_large(vd, zio, good_writes, + l, flags, sc_abd, vp_abd); + for (; l < VDEV_LABELS && + !(vd->vdev_large_label && l >= (VDEV_LABELS / 2)); l += 2) { + vdev_label_write(zio, vd, l, B_FALSE, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, good_writes, @@ -1963,7 +2305,7 @@
vdev_label_sync(zio_t *zio, uint64_t *good_writes, } static int -vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags, abd_t *sc_abd) { list_t *dl = &spa->spa_config_dirty_list; vdev_t *vd; @@ -1985,7 +2327,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); - vdev_label_sync(vio, good_writes, vd, l, txg, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags, sc_abd); zio_nowait(vio); } @@ -2002,12 +2344,22 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) zio_t *vio = zio_null(zio, spa, NULL, vdev_label_sync_ignore_done, good_writes, flags); vdev_label_sync(vio, good_writes, sav[i]->sav_vdevs[v], - l, txg, flags); + l, txg, flags, sc_abd); zio_nowait(vio); } } error = zio_wait(zio); + if (error == 0 && l == 1) { + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + mutex_enter(&vd->vdev_be_lock); + if (vd->vdev_next_bootenv) { + cv_broadcast(&vd->vdev_be_cv); + vd->vdev_next_bootenv = NULL; + } + mutex_exit(&vd->vdev_be_lock); + } + } /* * Flush the new labels to disk. @@ -2103,6 +2455,43 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) (void) zio_wait(zio); + boolean_t large_label = spa->spa_root_vdev->vdev_large_label; + spa_aux_vdev_t *sav[2] = {&spa->spa_spares, &spa->spa_l2cache}; + for (int i = 0; i < 2; i++) { + if (!sav[i]->sav_label_sync) + continue; + for (int v = 0; !large_label && v < sav[i]->sav_count; v++) { + large_label |= sav[i]->sav_vdevs[v]->vdev_large_label; + } + } + + abd_t *sc_abd = NULL; + if (large_label) { + nvlist_t *spa_config = spa_config_generate(spa, + spa->spa_root_vdev, txg, B_FALSE); + size_t sc_buflen = 0; + VERIFY0(nvlist_size(spa_config, &sc_buflen, NV_ENCODE_XDR)); + uint8_t ashift = spa->spa_min_ashift == INT_MAX ? 
ASHIFT_MAX : + spa->spa_min_ashift; + sc_buflen = P2ROUNDUP(sc_buflen + sizeof (zio_eck_t), + 1 << ashift); + sc_abd = abd_alloc_linear(sc_buflen, B_TRUE); + char *sc_buf = abd_to_buf(sc_abd); + if (error || (error = nvlist_pack(spa_config, &sc_buf, + &sc_buflen, NV_ENCODE_XDR, KM_SLEEP)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_config() failed to pack " + "nvlist with error %d for pool '%s' when " + "syncing out labels of dirty vdevs", error, + spa_name(spa)); + } + abd_free(sc_abd); + fnvlist_free(spa_config); + goto retry; + } + fnvlist_free(spa_config); + } + /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the @@ -2112,12 +2501,13 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { + if ((error = vdev_label_sync_list(spa, 0, txg, flags, sc_abd)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } + abd_free(sc_abd); goto retry; } @@ -2141,6 +2531,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } + abd_free(sc_abd); goto retry; } @@ -2157,14 +2548,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. 
*/ - if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { + if ((error = vdev_label_sync_list(spa, 1, txg, flags, sc_abd)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } + abd_free(sc_abd); goto retry; } + abd_free(sc_abd); return (0); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 18efdaac006f..87bdcdf941a8 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -197,8 +197,9 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) */ /* Fix zio_offset for leaf vdevs */ - if (vd->vdev_ops->vdev_op_leaf) - zio_offset += VDEV_LABEL_START_SIZE; + if (vd->vdev_ops->vdev_op_leaf) { + zio_offset += VDEV_LABEL_START_SIZE(vd); + } /* Standard load based on pending queue length. */ load = vdev_queue_length(vd); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 80727b0d8f91..b8b3c5aba71b 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2452,7 +2452,7 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) continue; ASSERT3U(rc->rc_offset + rc->rc_size, <, - cvd->vdev_psize - VDEV_LABEL_END_SIZE); + cvd->vdev_psize - VDEV_LABEL_END_SIZE(cvd)); ASSERT3P(rc->rc_abd, !=, NULL); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, @@ -2463,9 +2463,9 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) if (rc->rc_shadow_devidx != INT_MAX) { vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; - ASSERT3U( - rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, - cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(rc->rc_shadow_offset + + abd_get_size(rc->rc_abd), <, cvd2->vdev_psize - + VDEV_LABEL_END_SIZE(cvd2)); zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, rc->rc_shadow_offset, rc->rc_abd, @@ -2495,8 +2495,8 @@ raidz_start_skip_writes(zio_t *zio) continue; ASSERT0P(rc->rc_abd); - ASSERT3U(rc->rc_offset, <, - cvd->vdev_psize -
VDEV_LABEL_END_SIZE); + ASSERT3U(rc->rc_offset, <, cvd->vdev_psize - + VDEV_LABEL_END_SIZE(cvd)); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, @@ -4544,6 +4544,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { + vdev_t *cvd = raidvd->vdev_child[i]; /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE * to the offset to calculate the physical offset to @@ -4551,10 +4552,12 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) * access the scratch area. */ zio_nowait(zio_vdev_child_io(pio, NULL, - raidvd->vdev_child[i], - VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], - write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + cvd, (cvd->vdev_large_label ? + VDEV_RESERVE_OFFSET : VDEV_BOOT_OFFSET) - + VDEV_LABEL_START_SIZE(cvd), + abds[i], write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { @@ -4621,13 +4624,15 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { + vdev_t *cvd = raidvd->vdev_child[i]; /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ - zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], - VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + zio_nowait(zio_vdev_child_io(pio, NULL, cvd, + (cvd->vdev_large_label ? 
VDEV_RESERVE_OFFSET : + VDEV_BOOT_OFFSET) - VDEV_LABEL_START_SIZE(cvd), abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } @@ -4779,13 +4784,15 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { + vdev_t *cvd = raidvd->vdev_child[i]; /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ - zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], - VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + zio_nowait(zio_vdev_child_io(pio, NULL, cvd, + (cvd->vdev_large_label ? VDEV_RESERVE_OFFSET : + VDEV_BOOT_OFFSET) - VDEV_LABEL_START_SIZE(cvd), abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index eee18b367909..18f8dba9594c 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -617,9 +617,9 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + + error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE(vd) + zfs_rs_get_start(rs, ta->trim_tree) + - (w *extent_bytes_max), MIN(size - + (w * extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { goto done; diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 221f24e381dc..67efd2d10bd6 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1233,6 +1233,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, memcpy(report->zcr_ckinfo, info, sizeof (*info)); } + ASSERT(vd->vdev_top); report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; report->zcr_align = vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); @@ 
-1248,6 +1249,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, } #endif + ASSERT(zio->io_logical); mutex_enter(&spa->spa_errlist_lock); report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 4cf8912d4269..3efd9bd40242 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1558,13 +1558,14 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; + spa_t *spa = vd->vdev_spa; ASSERT0(vd->vdev_children); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE(vd) || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE(vd)); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + zio = zio_create(pio, spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); @@ -1579,13 +1580,14 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; + spa_t *spa = vd->vdev_spa; ASSERT0(vd->vdev_children); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE(vd) || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE(vd)); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + zio = zio_create(pio, spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); @@ -1662,7 +1664,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, 
uint64_t offset, if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); - offset += VDEV_LABEL_START_SIZE; + offset += VDEV_LABEL_START_SIZE(vd); } flags |= ZIO_VDEV_CHILD_FLAGS(pio); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 981a1be4847c..3491d5cfcb86 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -251,7 +251,7 @@ zio_match_dva(zio_t *zio) /* Compensate for vdev label added to leaves */ if (zio->io_vd->vdev_ops->vdev_op_leaf) - off += VDEV_LABEL_START_SIZE; + off += VDEV_LABEL_START_SIZE(zio->io_vd); if (zio->io_vd == vd && zio->io_offset == off) break; @@ -326,9 +326,10 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t offset = zio->io_offset; int label; int ret = 0; + boolean_t new = vd->vdev_large_label; - if (offset >= VDEV_LABEL_START_SIZE && - offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) + if (offset >= VDEV_LABEL_START_SIZE(vd) && + offset < vd->vdev_psize - VDEV_LABEL_END_SIZE(vd)) return (0); rw_enter(&inject_lock, RW_READER); @@ -346,9 +347,9 @@ zio_handle_label_injection(zio_t *zio, int error) * vdev label. We must determine the label which is being * updated and adjust our region accordingly. 
*/ - label = vdev_label_number(vd->vdev_psize, offset); - start = vdev_label_offset(vd->vdev_psize, label, start); - end = vdev_label_offset(vd->vdev_psize, label, end); + label = vdev_label_number(vd->vdev_psize, offset, new); + start = vdev_label_offset(vd->vdev_psize, label, start, new); + end = vdev_label_offset(vd->vdev_psize, label, end, new); if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { @@ -417,8 +418,8 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) !(zio->io_flags & ZIO_FLAG_PROBE)) { uint64_t offset = zio->io_offset; - if (offset < VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) + if (offset < VDEV_LABEL_START_SIZE(vd) || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE(vd)) return (0); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 36a56ddee586..69dc68f18d16 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -830,6 +830,10 @@ tags = ['functional', 'mv_files'] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] +[tests/functional/large_label] +tests = ['large_label_001_pos', 'large_label_002_pos', 'large_label_mirror'] +tags = ['functional', 'large_label'] + [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df', 'enospc_ganging', 'enospc_rm'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index b56ffc3a4a2d..e062b368e198 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -495,6 +495,10 @@ tags = ['functional', 'mmap'] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] +[tests/functional/large_label] +tests = ['large_label_001_pos'] +tags = ['functional', 'large_label'] + [tests/functional/nopwrite] tests = ['nopwrite_sync', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 
ec5da0defa4a..c5a7bbb4aefd 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -341,6 +341,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ + functional/large_label/large_label.kshlib \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ @@ -1733,6 +1734,11 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/nestedfs/cleanup.ksh \ functional/nestedfs/nestedfs_001_pos.ksh \ functional/nestedfs/setup.ksh \ + functional/large_label/cleanup.ksh \ + functional/large_label/large_label_001_pos.ksh \ + functional/large_label/large_label_002_pos.ksh \ + functional/large_label/large_label_mirror.ksh \ + functional/large_label/setup.ksh \ functional/nopwrite/cleanup.ksh \ functional/nopwrite/nopwrite_copies.ksh \ functional/nopwrite/nopwrite_mtime.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index bdf5fdf85cff..dc40ea939573 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -93,6 +93,7 @@ typeset -a properties=( "feature@redaction_list_spill" "feature@dynamic_gang_header" "feature@physical_rewrite" + "feature@large_label" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/large_label/cleanup.ksh b/tests/zfs-tests/tests/functional/large_label/cleanup.ksh new file mode 100755 index 000000000000..65ab91e29853 --- /dev/null +++ b/tests/zfs-tests/tests/functional/large_label/cleanup.ksh @@ -0,0 +1,24 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/large_label/large_label.kshlib + +log_must zpool destroy $BASE_POOL +[[ -d "$TESTDIR1" ]] && log_must rm -rf $TESTDIR1 +default_cleanup diff --git a/tests/zfs-tests/tests/functional/large_label/large_label.kshlib b/tests/zfs-tests/tests/functional/large_label/large_label.kshlib new file mode 100644 index 000000000000..0242fc48d3d7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/large_label/large_label.kshlib @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara, Inc. 
+# + +BASE_POOL="base" + +function uses_old_label +{ + zdb -l $1 | grep -q "LABEL(old)" +} + +function uses_large_label +{ + zdb -l $1 | grep -q "LABEL(new)" +} diff --git a/tests/zfs-tests/tests/functional/large_label/large_label_001_pos.ksh b/tests/zfs-tests/tests/functional/large_label/large_label_001_pos.ksh new file mode 100755 index 000000000000..45b8cabfd440 --- /dev/null +++ b/tests/zfs-tests/tests/functional/large_label/large_label_001_pos.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/large_label/large_label.kshlib + + +# +# DESCRIPTION: +# Verify that new label works for basic pool operation. +# +# STRATEGY: +# 1. Create large virtual disk for the new label type +# 2. Create pool using large-label disk +# 3. 
Verify disks are using the new label format +# + +function cleanup { + log_pos zpool destroy $TESTPOOL + log_must rm $mntpnt/dsk* +} + +log_assert "Verify that new label works for basic pool operation" +log_onexit cleanup + +mntpnt="$TESTDIR1" +log_must truncate -s 2T $mntpnt/dsk0 + +DSK="$mntpnt/dsk" + +log_must create_pool -f $TESTPOOL "$DSK"0 + +log_must zdb -l "$DSK"0 +log_must uses_large_label "$DSK"0 +log_mustnot uses_old_label "$DSK"0 + +log_pass "New label works for basic pool operation" diff --git a/tests/zfs-tests/tests/functional/large_label/large_label_002_pos.ksh b/tests/zfs-tests/tests/functional/large_label/large_label_002_pos.ksh new file mode 100755 index 000000000000..7a69aaf5bf8b --- /dev/null +++ b/tests/zfs-tests/tests/functional/large_label/large_label_002_pos.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/large_label/large_label.kshlib + + +# +# DESCRIPTION: +# Verify that new label works for more advanced pool operations. +# +# STRATEGY: +# 1. Create large virtual disks for the new label type +# 2. Create pool using large-label disks +# 3. Verify disks are using the new label format +# 4. 
Perform more advanced pool operations (import, export, scrub, destroy) +# + +function cleanup { + log_pos zpool destroy $TESTPOOL + log_must rm $mntpnt/dsk* +} + +log_assert "Verify that new label works for more advanced pool operations" +log_onexit cleanup + +mntpnt="$TESTDIR1" +log_must truncate -s 2T $mntpnt/dsk{0,1,2,3,4,5,6,7} + +DSK="$mntpnt/dsk" + +log_must create_pool -f $TESTPOOL "$DSK"0 mirror "$DSK"1 "$DSK"2 raidz1 "$DSK"3 "$DSK"4 "$DSK"5 log "$DSK"6 special "$DSK"7 + +log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=1M count=1k +log_must sync_pool $TESTPOOL + +for i in `seq 0 7`; do + log_must zdb -l "$DSK""$i" + log_must uses_large_label "$DSK""$i" +done + +log_must zpool scrub -w $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $mntpnt $TESTPOOL + +log_pass "New label works for more advanced pool operations" diff --git a/tests/zfs-tests/tests/functional/large_label/large_label_mirror.ksh b/tests/zfs-tests/tests/functional/large_label/large_label_mirror.ksh new file mode 100755 index 000000000000..a0a78d54b60c --- /dev/null +++ b/tests/zfs-tests/tests/functional/large_label/large_label_mirror.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/large_label/large_label.kshlib + + +# +# DESCRIPTION: Verify that mixed mirrors can exist and work correctly +# +# STRATEGY: +# 1. Create a pool with a small device that uses the old label layout +# 2. 
Attach a large device that wants to use the new label
+# 3. Verify correct pool operation
+# 4. Split the pool, verify each sub-pool behaves correctly.
+#
+
+function cleanup {	# destroy both pools and delete the two backing files
+	log_pos zpool destroy $TESTPOOL
+	log_pos zpool destroy $TESTPOOL2
+	log_must rm -f $mntpnt/big1 $mntpnt/small1	# only this test's files; -f: may be absent
+}
+
+log_assert "Verify that mixed mirrors can exist and work correctly"
+log_onexit cleanup
+
+mntpnt="$TESTDIR1"
+log_must truncate -s 2T $mntpnt/big1
+log_must truncate -s 2G $mntpnt/small1
+
+log_must create_pool $TESTPOOL $mntpnt/small1
+log_must zfs create $TESTPOOL/fs
+log_must dd if=/dev/urandom of=/$TESTPOOL/fs/f1 bs=1M count=16
+log_must touch /$TESTPOOL/f2
+
+log_must sync_pool $TESTPOOL
+
+log_must zpool attach $TESTPOOL $mntpnt/small1 $mntpnt/big1
+log_must zpool wait -t resilver $TESTPOOL
+log_must uses_large_label $mntpnt/big1
+log_must uses_old_label $mntpnt/small1
+log_must zpool scrub -w $TESTPOOL
+log_must zpool export $TESTPOOL
+log_must zpool import -d $mntpnt $TESTPOOL
+
+log_must zpool split $TESTPOOL $TESTPOOL2
+log_must zpool import -d $mntpnt $TESTPOOL2
+log_must uses_large_label $mntpnt/big1	# label formats must be unchanged by the split
+log_must uses_old_label $mntpnt/small1
+log_must zpool scrub -w $TESTPOOL
+log_must zpool scrub -w $TESTPOOL2
+
+log_pass "Mixed mirrors can exist and work correctly"
diff --git a/tests/zfs-tests/tests/functional/large_label/setup.ksh b/tests/zfs-tests/tests/functional/large_label/setup.ksh
new file mode 100755
index 000000000000..023e5f305079
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/large_label/setup.ksh
@@ -0,0 +1,24 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.
A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/large_label/large_label.kshlib
+
+[[ -d $TESTDIR1 ]] || log_must mkdir -p $TESTDIR1	# make sure the mountpoint directory exists
+log_must create_pool $BASE_POOL $DISKS	# base pool built from $DISKS
+log_must zfs set mountpoint=$TESTDIR1 $BASE_POOL	# mount the base pool at $TESTDIR1