Skip to content

Commit 7b63670

Browse files
author
Paul Dagnelie
committed
Implement new label format for large disks
This patch contains the logic for a new, larger label format. This format is intended to support disks with large sector sizes. By using a larger label, we can store more uberblocks and other critical pool metadata. We can also use the extra space to enable new ZFS features in the future. This initial commit does not add new capabilities, but provides the framework for them. Signed-off-by: Paul Dagnelie <[email protected]> Sponsored-by: Wasabi, Inc. Sponsored-by: Klara, Inc.
1 parent 92da9e0 commit 7b63670

37 files changed

+1514
-210
lines changed

cmd/zdb/zdb.c

Lines changed: 240 additions & 48 deletions
Large diffs are not rendered by default.

cmd/zhack.c

Lines changed: 276 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -496,11 +496,11 @@ zhack_do_feature(int argc, char **argv)
496496
return (0);
497497
}
498498

499-
#define ASHIFT_UBERBLOCK_SHIFT(ashift) \
499+
#define ASHIFT_UBERBLOCK_SHIFT(ashift, new) \
500500
MIN(MAX(ashift, UBERBLOCK_SHIFT), \
501-
MAX_UBERBLOCK_SHIFT)
502-
#define ASHIFT_UBERBLOCK_SIZE(ashift) \
503-
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift))
501+
MAX_UBERBLOCK_SHIFT(new))
502+
#define ASHIFT_UBERBLOCK_SIZE(ashift, new) \
503+
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift, new))
504504

505505
#define REPAIR_LABEL_STATUS_CKSUM (1 << 0)
506506
#define REPAIR_LABEL_STATUS_UB (1 << 1)
@@ -525,6 +525,26 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,
525525
return (0);
526526
}
527527

/*
 * Read exactly buflen bytes from fd at the given byte offset into buf.
 *
 * fd		open descriptor for the device or file being repaired
 * buf		destination buffer, at least buflen bytes long
 * buflen	number of bytes that must be read
 * offset	absolute byte offset to read from
 * l		label index, used only in diagnostics
 *
 * Returns 0 when all buflen bytes were read.  On an I/O error or a short
 * read (including hitting end-of-file before any byte was read) a message
 * is printed to stderr and -1 is returned.
 */
static int
zhack_repair_read(const int fd, uint8_t *buf, size_t buflen,
    const uint64_t offset, const int l)
{
	/* pread64() returns ssize_t; an int could truncate large counts. */
	const ssize_t nread = pread64(fd, buf, buflen, offset);

	if (nread == -1) {
		(void) fprintf(stderr,
		    "error: cannot read buffer at %" PRIu64
		    " for label %d: %s\n",
		    offset, l, strerror(errno));
		return (-1);
	}

	if ((size_t)nread != buflen) {
		/*
		 * Report failure with a fixed nonzero value: returning the
		 * byte count would turn a zero-byte read into "success".
		 */
		(void) fprintf(stderr,
		    "error: bad read size at %" PRIu64 " for label %d \n",
		    offset, l);
		return (-1);
	}

	return (0);
}
547+
528548
static void
529549
zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
530550
const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
@@ -687,7 +707,7 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
687707
(char *)vl + offsetof(vdev_label_t, vl_uberblock);
688708
zio_eck_t *ub_eck =
689709
(zio_eck_t *)
690-
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1;
710+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
691711

692712
if (ub_eck->zec_magic != 0) {
693713
(void) fprintf(stderr,
@@ -706,10 +726,39 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
706726
if (zhack_repair_write_label(l, fd, byteswap,
707727
ub_data, ub_eck,
708728
label_offset + offsetof(vdev_label_t, vl_uberblock),
709-
ASHIFT_UBERBLOCK_SIZE(ashift)))
729+
ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE)))
710730
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
711731
}
712732

733+
static void
734+
zhack_repair_write_uberblock_new(void *ub_data, const int l,
735+
const uint64_t ashift, const int fd, const int byteswap,
736+
const uint64_t label_offset, uint32_t *labels_repaired)
737+
{
738+
zio_eck_t *ub_eck =
739+
(zio_eck_t *)
740+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
741+
742+
if (ub_eck->zec_magic != 0) {
743+
(void) fprintf(stderr,
744+
"error: label %d: "
745+
"Expected Uberblock checksum magic number to "
746+
"be 0, but got %" PRIu64 "\n",
747+
l, ub_eck->zec_magic);
748+
(void) fprintf(stderr, "It would appear there's already "
749+
"a checksum for the uberblock.\n");
750+
return;
751+
}
752+
753+
754+
ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
755+
756+
if (zhack_repair_write_label(l, fd, byteswap,
757+
ub_data, ub_eck, label_offset + VDEV_LARGE_UBERBLOCK_RING,
758+
ASHIFT_UBERBLOCK_SIZE(ashift, B_TRUE)))
759+
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
760+
}
761+
713762
static void
714763
zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
715764
{
@@ -723,12 +772,13 @@ zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
723772

724773
static int
725774
zhack_repair_test_cksum(const int byteswap, void *vdev_data,
726-
zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l)
775+
const uint64_t size, zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset,
776+
const int l)
727777
{
728778
const zio_cksum_t expected_cksum = vdev_eck->zec_cksum;
729779
zio_cksum_t actual_cksum;
730780
zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset,
731-
VDEV_PHYS_SIZE, vdev_eck, &actual_cksum);
781+
size, vdev_eck, &actual_cksum);
732782
const uint64_t expected_magic = byteswap ?
733783
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
734784
const uint64_t actual_magic = vdev_eck->zec_magic;
@@ -756,15 +806,17 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
756806

757807
static void
758808
zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
759-
vdev_label_t *vl, const uint64_t label_offset, const int l,
760-
uint32_t *labels_repaired)
809+
vdev_label_t *vl, const uint64_t filesize, const int l,
810+
uint32_t *labels_repaired, boolean_t *large_label)
761811
{
762812
ssize_t err;
763813
uberblock_t *ub = (uberblock_t *)vl->vl_uberblock;
764814
void *vdev_data =
765815
(char *)vl + offsetof(vdev_label_t, vl_vdev_phys);
766816
zio_eck_t *vdev_eck =
767817
(zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
818+
const uint64_t label_offset = vdev_label_offset(filesize, l, 0,
819+
B_FALSE);
768820
const uint64_t vdev_phys_offset =
769821
label_offset + offsetof(vdev_label_t, vl_vdev_phys);
770822
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
@@ -798,8 +850,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
798850
}
799851

800852
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
801-
zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck,
802-
vdev_phys_offset, l) != 0) {
853+
zhack_repair_test_cksum(byteswap, vdev_data, VDEV_PHYS_SIZE,
854+
vdev_eck, vdev_phys_offset, l) != 0) {
803855
(void) fprintf(stderr, "It would appear checksums are "
804856
"corrupted. Try zhack repair label -c <device>\n");
805857
return;
@@ -812,6 +864,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
812864
"error: cannot unpack nvlist label %d\n", l);
813865
return;
814866
}
867+
(void) nvlist_lookup_boolean_value(cfg, ZPOOL_CONFIG_LARGE_LABEL,
868+
large_label);
815869

816870
err = zhack_repair_check_label(ub,
817871
l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
@@ -836,13 +890,212 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
836890

837891
zhack_repair_write_uberblock(vl,
838892
l, ashift, fd, byteswap, label_offset, labels_repaired);
893+
if (large_label) {
894+
zhack_repair_write_uberblock_new(ub, l, ashift,
895+
fd, byteswap, vdev_label_offset(filesize, l, 0,
896+
B_TRUE), labels_repaired);
897+
}
839898
}
840899

841900
if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck,
842901
vdev_phys_offset, VDEV_PHYS_SIZE))
843-
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
902+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
903+
904+
fsync(fd);
905+
}
906+
907+
static void
908+
zhack_repair_one_label_large(const zhack_repair_op_t op, const int fd,
909+
const uint64_t label_offset, const int l, uint32_t *labels_repaired)
910+
{
911+
ssize_t err;
912+
void *toc_data = NULL, *bootenv = NULL, *vdev_config = NULL;
913+
void *spa_config = NULL, *ub = NULL;
914+
/*
915+
* Note that currently, this can't handle disks with larger than 8k
916+
* sector sizes. That needs to be fixed eventually.
917+
*/
918+
toc_data = malloc(VDEV_TOC_SIZE);
919+
err = zhack_repair_read(fd, toc_data, VDEV_TOC_SIZE, label_offset, l);
920+
if (err)
921+
goto out;
922+
923+
zio_eck_t *toc_eck = (zio_eck_t *)(toc_data + VDEV_TOC_SIZE) - 1;
924+
if (toc_eck->zec_magic == 0) {
925+
(void) fprintf(stderr, "error: label %d: "
926+
"Expected the nvlist checksum magic number to not be zero"
927+
"\n",
928+
l);
929+
(void) fprintf(stderr, "There should already be a checksum "
930+
"for the label.\n");
931+
goto out;
932+
}
933+
934+
int byteswap =
935+
(toc_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
936+
937+
if (byteswap) {
938+
byteswap_uint64_array(&toc_eck->zec_cksum,
939+
sizeof (zio_cksum_t));
940+
toc_eck->zec_magic = BSWAP_64(toc_eck->zec_magic);
941+
}
942+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
943+
zhack_repair_test_cksum(byteswap, toc_data, VDEV_TOC_SIZE,
944+
toc_eck, label_offset, l) != 0) {
945+
(void) fprintf(stderr, "It would appear checksums are "
946+
"corrupted. Try zhack repair label -c <device>\n");
947+
goto out;
948+
}
949+
950+
nvlist_t *toc;
951+
err = nvlist_unpack(toc_data, VDEV_TOC_SIZE, &toc, 0);
952+
if (err) {
953+
(void) fprintf(stderr,
954+
"error: cannot unpack nvlist TOC %d\n", l);
955+
goto out;
956+
}
957+
958+
uint32_t bootenv_size, vc_size, sc_size;
959+
if ((err = nvlist_lookup_uint32(toc, VDEV_TOC_BOOT_REGION,
960+
&bootenv_size)) || (err = nvlist_lookup_uint32(toc,
961+
VDEV_TOC_VDEV_CONFIG, &vc_size)) || (err = nvlist_lookup_uint32(toc,
962+
VDEV_TOC_POOL_CONFIG, &sc_size))) {
963+
(void) fprintf(stderr,
964+
"error: TOC missing core fields %d\n", l);
965+
goto out;
966+
}
967+
bootenv = malloc(bootenv_size);
968+
zio_eck_t *bootenv_eck = (zio_eck_t *)(bootenv + bootenv_size) - 1;
969+
vdev_config = malloc(vc_size);
970+
zio_eck_t *vc_eck = (zio_eck_t *)(vdev_config + vc_size) - 1;
971+
spa_config = malloc(sc_size);
972+
zio_eck_t *sc_eck = (zio_eck_t *)(spa_config + sc_size) - 1;
973+
974+
uint64_t offset = label_offset + VDEV_TOC_SIZE;
975+
if (bootenv_size != 0) {
976+
if ((err = zhack_repair_read(fd, bootenv,
977+
bootenv_size, offset, l)))
978+
goto out;
979+
if (byteswap) {
980+
byteswap_uint64_array(&bootenv_eck->zec_cksum,
981+
sizeof (zio_cksum_t));
982+
bootenv_eck->zec_magic =
983+
BSWAP_64(bootenv_eck->zec_magic);
984+
}
985+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
986+
zhack_repair_test_cksum(byteswap, bootenv, bootenv_size,
987+
bootenv_eck, offset, l) != 0) {
988+
(void) fprintf(stderr, "It would appear checksums are "
989+
"corrupted. Try zhack repair label -c <device>\n");
990+
goto out;
991+
}
992+
}
993+
994+
offset += bootenv_size;
995+
if ((err = zhack_repair_read(fd, vdev_config, vc_size, offset, l)))
996+
goto out;
997+
998+
if (byteswap) {
999+
byteswap_uint64_array(&sc_eck->zec_cksum,
1000+
sizeof (zio_cksum_t));
1001+
vc_eck->zec_magic = BSWAP_64(vc_eck->zec_magic);
1002+
}
1003+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1004+
zhack_repair_test_cksum(byteswap, vdev_config, vc_size,
1005+
vc_eck, offset, l) != 0) {
1006+
(void) fprintf(stderr, "It would appear checksums are "
1007+
"corrupted. Try zhack repair label -c <device>\n");
1008+
goto out;
1009+
}
1010+
offset += vc_size;
1011+
if ((err = zhack_repair_read(fd, spa_config, sc_size, offset, l)))
1012+
goto out;
1013+
1014+
if (byteswap) {
1015+
byteswap_uint64_array(&sc_eck->zec_cksum,
1016+
sizeof (zio_cksum_t));
1017+
vc_eck->zec_magic = BSWAP_64(sc_eck->zec_magic);
1018+
}
1019+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1020+
zhack_repair_test_cksum(byteswap, spa_config, sc_size,
1021+
sc_eck, offset, l) != 0) {
1022+
(void) fprintf(stderr, "It would appear checksums are "
1023+
"corrupted. Try zhack repair label -c <device>\n");
1024+
goto out;
1025+
}
1026+
1027+
nvlist_t *cfg;
1028+
err = nvlist_unpack(vdev_config, vc_size - sizeof (zio_eck_t), &cfg, 0);
1029+
if (err) {
1030+
(void) fprintf(stderr,
1031+
"error: cannot unpack nvlist label %d\n", l);
1032+
return;
1033+
}
1034+
1035+
ub = malloc(UBERBLOCK_SHIFT);
1036+
err = zhack_repair_read(fd, ub, UBERBLOCK_SHIFT,
1037+
label_offset + VDEV_LARGE_UBERBLOCK_RING, l);
1038+
if (err)
1039+
goto out;
1040+
1041+
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
1042+
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
1043+
nvlist_t *vdev_tree_cfg = NULL;
1044+
uint64_t ashift;
1045+
err = zhack_repair_check_label(ub, l, cfg_keys, ARRAY_SIZE(cfg_keys),
1046+
cfg, vdev_tree_cfg, &ashift);
1047+
if (err)
1048+
return;
1049+
1050+
if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) {
1051+
char *buf;
1052+
size_t buflen;
1053+
1054+
err = zhack_repair_undetach(ub, cfg, l);
1055+
if (err)
1056+
return;
1057+
1058+
buf = vdev_config;
1059+
buflen = vc_size - sizeof (zio_eck_t);
1060+
if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
1061+
(void) fprintf(stderr,
1062+
"error: label %d: Failed to pack nvlist\n", l);
1063+
return;
1064+
}
1065+
1066+
zhack_repair_write_uberblock_new(ub, l, ashift, fd, byteswap,
1067+
label_offset, labels_repaired);
1068+
}
1069+
1070+
offset = label_offset;
1071+
if (zhack_repair_write_label(l, fd, byteswap, toc_data, toc_eck,
1072+
offset, VDEV_TOC_SIZE))
1073+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1074+
offset += VDEV_TOC_SIZE;
1075+
if (zhack_repair_write_label(l, fd, byteswap, bootenv, bootenv_eck,
1076+
offset, bootenv_size))
1077+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1078+
offset += bootenv_size;
1079+
if (zhack_repair_write_label(l, fd, byteswap, vdev_config, vc_eck,
1080+
offset, vc_size))
1081+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1082+
offset += vc_size;
1083+
if (zhack_repair_write_label(l, fd, byteswap, spa_config, sc_eck,
1084+
offset, sc_size))
1085+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
8441086

8451087
fsync(fd);
1088+
out:
1089+
if (toc_data)
1090+
free(toc_data);
1091+
if (bootenv)
1092+
free(bootenv);
1093+
if (vdev_config)
1094+
free(vdev_config);
1095+
if (spa_config)
1096+
free(spa_config);
1097+
if (ub)
1098+
free(ub);
8461099
}
8471100

8481101
static const char *
@@ -885,9 +1138,18 @@ zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv)
8851138
filesize =
8861139
(filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t);
8871140

1141+
boolean_t large_label = B_FALSE;
8881142
for (int l = 0; l < VDEV_LABELS; l++) {
8891143
zhack_repair_one_label(op, fd, &labels[l],
890-
vdev_label_offset(filesize, l, 0), l, labels_repaired);
1144+
filesize, l, labels_repaired, &large_label);
1145+
if (large_label)
1146+
break;
1147+
}
1148+
if (large_label) {
1149+
for (int l = 0; l < VDEV_LABELS; l++) {
1150+
zhack_repair_one_label_large(op, fd,
1151+
filesize, l, labels_repaired);
1152+
}
8911153
}
8921154

8931155
close(fd);

0 commit comments

Comments
 (0)