Skip to content

Commit d64ed22

Browse files
author
Paul Dagnelie
committed
Anyraid implementation
Signed-off-by: Paul Dagnelie <[email protected]> Sponsored-by: Eshtek, creators of HexOS Sponsored-by: Klara, Inc.
1 parent 0342ea9 commit d64ed22

File tree

95 files changed

+4545
-354
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+4545
-354
lines changed

cmd/zdb/zdb.c

Lines changed: 487 additions & 5 deletions
Large diffs are not rendered by default.

cmd/zpool/zpool_vdev.c

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
#include "zpool_util.h"
7979
#include <sys/zfs_context.h>
8080
#include <sys/stat.h>
81+
#include <sys/vdev_anyraid.h>
8182

8283
/*
8384
* For any given vdev specification, we can have multiple errors. The
@@ -457,7 +458,8 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b,
457458
{
458459
if ((strcmp(a->zprl_type, "raidz") == 0 ||
459460
strcmp(a->zprl_type, "draid") == 0) &&
460-
strcmp(b->zprl_type, "mirror") == 0) {
461+
(strcmp(b->zprl_type, "mirror") == 0 ||
462+
strcmp(b->zprl_type, "anyraid") == 0)) {
461463
*raidz = a;
462464
*mirror = b;
463465
return (B_TRUE);
@@ -567,6 +569,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
567569
* already reported an error for this spec, so don't
568570
* bother doing it again.
569571
*/
572+
const char *orig_type = type;
570573
type = NULL;
571574
dontreport = 0;
572575
vdev_size = -1LL;
@@ -666,7 +669,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
666669
if (!dontreport &&
667670
(vdev_size != -1LL &&
668671
(llabs(size - vdev_size) >
669-
ZPOOL_FUZZ))) {
672+
ZPOOL_FUZZ)) && strcmp(orig_type,
673+
VDEV_TYPE_ANYRAID) != 0) {
670674
if (ret != NULL)
671675
free(ret);
672676
ret = NULL;
@@ -746,19 +750,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
746750
else
747751
return (NULL);
748752
}
749-
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
750-
0) {
751-
if (ret != NULL)
752-
free(ret);
753-
ret = NULL;
754-
if (fatal)
755-
vdev_error(gettext(
756-
"mismatched replication level: "
757-
"both %s and %s vdevs are "
758-
"present\n"),
759-
lastrep.zprl_type, rep.zprl_type);
760-
else
761-
return (NULL);
762753
} else if (lastrep.zprl_parity != rep.zprl_parity) {
763754
if (ret)
764755
free(ret);
@@ -1220,7 +1211,7 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
12201211
}
12211212

12221213
/*
1223-
* Returns the parity level extracted from a raidz or draid type.
1214+
* Returns the parity level extracted from a raidz, anyraid, or draid type.
12241215
* If the parity cannot be determined zero is returned.
12251216
*/
12261217
static int
@@ -1248,6 +1239,22 @@ get_parity(const char *type)
12481239
return (0);
12491240
}
12501241
}
1242+
} else if (strncmp(type, VDEV_TYPE_ANYRAID,
1243+
strlen(VDEV_TYPE_ANYRAID)) == 0) {
1244+
p = type + strlen(VDEV_TYPE_ANYRAID);
1245+
1246+
if (*p == '\0') {
1247+
/* when unspecified default to 1-parity mirror */
1248+
return (1);
1249+
} else {
1250+
char *end;
1251+
errno = 0;
1252+
parity = strtol(p, &end, 10);
1253+
if (errno != 0 || *end != '\0' ||
1254+
parity < 0 || parity > VDEV_ANYRAID_MAXPARITY) {
1255+
return (0);
1256+
}
1257+
}
12511258
} else if (strncmp(type, VDEV_TYPE_DRAID,
12521259
strlen(VDEV_TYPE_DRAID)) == 0) {
12531260
p = type + strlen(VDEV_TYPE_DRAID);
@@ -1305,6 +1312,15 @@ is_grouping(const char *type, int *mindev, int *maxdev)
13051312
if (maxdev != NULL)
13061313
*maxdev = INT_MAX;
13071314

1315+
if (strncmp(type, VDEV_TYPE_ANYRAID, strlen(VDEV_TYPE_ANYRAID)) == 0) {
1316+
nparity = get_parity(type);
1317+
if (mindev != NULL)
1318+
*mindev = nparity + 1;
1319+
if (maxdev != NULL)
1320+
*maxdev = 255;
1321+
return (VDEV_TYPE_ANYRAID);
1322+
}
1323+
13081324
if (strcmp(type, "mirror") == 0) {
13091325
if (mindev != NULL)
13101326
*mindev = 2;
@@ -1339,6 +1355,22 @@ is_grouping(const char *type, int *mindev, int *maxdev)
13391355
return (NULL);
13401356
}
13411357

1358+
static int
1359+
anyraid_config_by_type(nvlist_t *nv, const char *type)
1360+
{
1361+
uint64_t nparity = 0;
1362+
1363+
if (strncmp(type, VDEV_TYPE_ANYRAID, strlen(VDEV_TYPE_ANYRAID)) != 0)
1364+
return (EINVAL);
1365+
1366+
nparity = (uint64_t)get_parity(type);
1367+
1368+
fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, VAP_MIRROR);
1369+
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
1370+
1371+
return (0);
1372+
}
1373+
13421374
/*
13431375
* Extract the configuration parameters encoded in the dRAID type and
13441376
* use them to generate a dRAID configuration. The expected format is:
@@ -1524,9 +1556,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)
15241556
nv = NULL;
15251557

15261558
/*
1527-
* If it's a mirror, raidz, or draid the subsequent arguments
1528-
* are its leaves -- until we encounter the next mirror,
1529-
* raidz or draid.
1559+
* If it's a mirror, raidz, anyraid, or draid the subsequent
1560+
* arguments are its leaves -- until we encounter the next
1561+
* mirror, raidz, anyraid, or draid.
15301562
*/
15311563
if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
15321564
nvlist_t **child = NULL;
@@ -1593,7 +1625,12 @@ construct_spec(nvlist_t *props, int argc, char **argv)
15931625
}
15941626

15951627
if (is_log) {
1596-
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1628+
/*
1629+
* TODO: only AnyRAID mirror is expected to be
1630+
* allowed.
1631+
*/
1632+
if (strcmp(type, VDEV_TYPE_MIRROR) != 0 &&
1633+
strcmp(type, VDEV_TYPE_ANYRAID) != 0) {
15971634
(void) fprintf(stderr,
15981635
gettext("invalid vdev "
15991636
"specification: unsupported 'log' "
@@ -1683,6 +1720,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
16831720
ZPOOL_CONFIG_NPARITY,
16841721
mindev - 1) == 0);
16851722
}
1723+
if (strcmp(type, VDEV_TYPE_ANYRAID) == 0) {
1724+
if (anyraid_config_by_type(nv, fulltype)
1725+
!= 0) {
1726+
for (c = 0; c < children; c++)
1727+
nvlist_free(child[c]);
1728+
free(child);
1729+
goto spec_out;
1730+
}
1731+
}
16861732
if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
16871733
if (draid_config_by_type(nv,
16881734
fulltype, children) != 0) {

cmd/ztest.c

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@
106106
#include <sys/zio.h>
107107
#include <sys/zil.h>
108108
#include <sys/zil_impl.h>
109+
#include <sys/vdev_anyraid.h>
109110
#include <sys/vdev_draid.h>
110111
#include <sys/vdev_impl.h>
111112
#include <sys/vdev_file.h>
@@ -278,6 +279,7 @@ extern uint64_t raidz_expand_max_reflow_bytes;
278279
extern uint_t raidz_expand_pause_point;
279280
extern boolean_t ddt_prune_artificial_age;
280281
extern boolean_t ddt_dump_prune_histogram;
282+
extern uint64_t zfs_anyraid_min_tile_size;
281283

282284

283285
static ztest_shared_opts_t *ztest_shared_opts;
@@ -673,10 +675,12 @@ fatal(int do_perror, const char *message, ...)
673675
fatal_msg = buf; /* to ease debugging */
674676

675677
out:
676-
if (ztest_dump_core)
678+
if (ztest_dump_core) {
677679
abort();
678-
else
680+
} else {
681+
// NOTE: Not safe if we've called kernel_fini already
679682
dump_debug_buffer();
683+
}
680684

681685
exit(3);
682686
}
@@ -769,7 +773,7 @@ static ztest_option_t option_table[] = {
769773
DEFAULT_RAID_CHILDREN, NULL},
770774
{ 'R', "raid-parity", "INTEGER", "Raid parity",
771775
DEFAULT_RAID_PARITY, NULL},
772-
{ 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind",
776+
{ 'K', "raid-kind", "raidz|eraidz|draid|anyraid|random", "Raid kind",
773777
NO_DEFAULT, "random"},
774778
{ 'D', "draid-data", "INTEGER", "Number of draid data drives",
775779
DEFAULT_DRAID_DATA, NULL},
@@ -1119,7 +1123,7 @@ process_options(int argc, char **argv)
11191123
}
11201124

11211125
if (strcmp(raid_kind, "random") == 0) {
1122-
switch (ztest_random(3)) {
1126+
switch (ztest_random(4)) {
11231127
case 0:
11241128
raid_kind = "raidz";
11251129
break;
@@ -1129,6 +1133,9 @@ process_options(int argc, char **argv)
11291133
case 2:
11301134
raid_kind = "draid";
11311135
break;
1136+
case 3:
1137+
raid_kind = "anyraid";
1138+
break;
11321139
}
11331140

11341141
if (ztest_opts.zo_verbose >= 3)
@@ -1180,11 +1187,25 @@ process_options(int argc, char **argv)
11801187
zo->zo_raid_parity = MIN(zo->zo_raid_parity,
11811188
zo->zo_raid_children - 1);
11821189

1183-
} else /* using raidz */ {
1184-
ASSERT0(strcmp(raid_kind, "raidz"));
1190+
} else if (strcmp(raid_kind, "raidz") == 0) {
1191+
zo->zo_raid_parity = MIN(zo->zo_raid_parity,
1192+
zo->zo_raid_children - 1);
1193+
} else if (strcmp(raid_kind, "anyraid") == 0) {
1194+
uint64_t min_devsize;
1195+
1196+
/* With fewer disks use 1G, otherwise 512M is OK */
1197+
min_devsize = (ztest_opts.zo_raid_children < 16) ?
1198+
(1ULL << 30) : (512ULL << 20);
1199+
if (zo->zo_vdev_size < min_devsize)
1200+
zo->zo_vdev_size = min_devsize;
11851201

11861202
zo->zo_raid_parity = MIN(zo->zo_raid_parity,
11871203
zo->zo_raid_children - 1);
1204+
1205+
(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_ANYRAID,
1206+
sizeof (zo->zo_raid_type));
1207+
} else {
1208+
fatal(B_FALSE, "invalid raid kind %s", raid_kind);
11881209
}
11891210

11901211
zo->zo_vdevtime =
@@ -1375,6 +1396,9 @@ make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
13751396
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
13761397
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
13771398
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
1399+
} else if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYRAID) == 0) {
1400+
fnvlist_add_uint8(raid, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE,
1401+
VAP_MIRROR);
13781402
}
13791403

13801404
for (c = 0; c < r; c++)
@@ -3165,7 +3189,8 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
31653189
return;
31663190

31673191
/* dRAID and AnyRAID were added after feature flags, skip upgrade test. */
3168-
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0)
3192+
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0 ||
3193+
strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYRAID) == 0)
31693194
return;
31703195

31713196
mutex_enter(&ztest_vdev_lock);
@@ -3789,28 +3814,47 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
37893814
if (ztest_opts.zo_raid_children > 1) {
37903815
if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0)
37913816
ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops);
3817+
else if (strcmp(oldvd->vdev_ops->vdev_op_type, "anyraid") == 0)
3818+
ASSERT3P(oldvd->vdev_ops, ==, &vdev_anyraid_ops);
37923819
else
37933820
ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops);
37943821
oldvd = oldvd->vdev_child[leaf % raidz_children];
37953822
}
37963823

3824+
if (!replacing && oldvd->vdev_parent->vdev_ops == &vdev_anyraid_ops) {
3825+
oldvd = oldvd->vdev_parent;
3826+
}
3827+
37973828
/*
37983829
* If we're already doing an attach or replace, oldvd may be a
3799-
* mirror vdev -- in which case, pick a random child.
3830+
* mirror vdev -- in which case, pick a random child. For anyraid vdevs,
3831+
* attachment occurs at the parent level.
38003832
*/
3801-
while (oldvd->vdev_children != 0) {
3833+
while (oldvd->vdev_children != 0 && oldvd->vdev_ops !=
3834+
&vdev_anyraid_ops) {
38023835
oldvd_has_siblings = B_TRUE;
38033836
ASSERT3U(oldvd->vdev_children, >=, 2);
38043837
oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
38053838
}
38063839

38073840
oldguid = oldvd->vdev_guid;
3808-
oldsize = vdev_get_min_asize(oldvd);
3841+
if (oldvd->vdev_ops != &vdev_anyraid_ops)
3842+
oldsize = vdev_get_min_asize(oldvd);
3843+
else
3844+
oldsize = oldvd->vdev_child[
3845+
ztest_random(oldvd->vdev_children)]->vdev_asize;
38093846
oldvd_is_log = oldvd->vdev_top->vdev_islog;
38103847
oldvd_is_special =
38113848
oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL ||
38123849
oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP;
3813-
(void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN);
3850+
if (oldvd->vdev_path == NULL) {
3851+
ASSERT3P(oldvd->vdev_ops, ==, &vdev_anyraid_ops);
3852+
snprintf(oldpath, MAXPATHLEN, "%s-%llu",
3853+
oldvd->vdev_ops->vdev_op_type,
3854+
(u_longlong_t)oldvd->vdev_id);
3855+
} else {
3856+
(void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN);
3857+
}
38143858
pvd = oldvd->vdev_parent;
38153859
pguid = pvd->vdev_guid;
38163860

@@ -3819,7 +3863,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
38193863
* to the detach the pool is scrubbed in order to prevent creating
38203864
* unrepairable blocks as a result of the data corruption injection.
38213865
*/
3822-
if (oldvd_has_siblings && ztest_random(2) == 0) {
3866+
if (oldvd_has_siblings && oldvd->vdev_ops != &vdev_anyraid_ops &&
3867+
ztest_random(2) == 0) {
38233868
spa_config_exit(spa, SCL_ALL, FTAG);
38243869

38253870
error = ztest_scrub_impl(spa);
@@ -3883,7 +3928,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
38833928
* If newvd is a distributed spare and it's being attached to a
38843929
* dRAID which is not its parent it should fail with ENOTSUP.
38853930
*/
3886-
if (pvd->vdev_ops != &vdev_mirror_ops &&
3931+
if (oldvd->vdev_ops == &vdev_anyraid_ops)
3932+
expected_error = 0;
3933+
else if (pvd->vdev_ops != &vdev_mirror_ops &&
38873934
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
38883935
pvd->vdev_ops == &vdev_replacing_ops ||
38893936
pvd->vdev_ops == &vdev_spare_ops))
@@ -3895,7 +3942,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
38953942
expected_error = replacing ? 0 : EBUSY;
38963943
else if (vdev_lookup_by_path(rvd, newpath) != NULL)
38973944
expected_error = EBUSY;
3898-
else if (!newvd_is_dspare && newsize < oldsize)
3945+
else if (newsize < oldsize && !(newvd_is_dspare ||
3946+
(pvd->vdev_ops == &vdev_anyraid_ops &&
3947+
newsize < pvd->vdev_ops->vdev_op_min_asize(pvd, oldvd))))
38993948
expected_error = EOVERFLOW;
39003949
else if (ashift > oldvd->vdev_top->vdev_ashift)
39013950
expected_error = EDOM;
@@ -3916,8 +3965,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
39163965
* When supported select either a healing or sequential resilver.
39173966
*/
39183967
boolean_t rebuilding = B_FALSE;
3919-
if (pvd->vdev_ops == &vdev_mirror_ops ||
3920-
pvd->vdev_ops == &vdev_root_ops) {
3968+
if (oldvd->vdev_ops != &vdev_anyraid_ops &&
3969+
(pvd->vdev_ops == &vdev_mirror_ops ||
3970+
pvd->vdev_ops == &vdev_root_ops)) {
39213971
rebuilding = !!ztest_random(2);
39223972
}
39233973

@@ -8994,6 +9044,9 @@ main(int argc, char **argv)
89949044
metaslab_df_alloc_threshold =
89959045
zs->zs_metaslab_df_alloc_threshold;
89969046

9047+
zfs_anyraid_min_tile_size = MIN(zfs_anyraid_min_tile_size,
9048+
ztest_opts.zo_vdev_size / 8);
9049+
89979050
if (zs->zs_do_init)
89989051
ztest_run_init();
89999052
else

include/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ COMMON_H = \
100100
sys/unique.h \
101101
sys/uuid.h \
102102
sys/vdev.h \
103+
sys/vdev_anyraid.h \
103104
sys/vdev_disk.h \
104105
sys/vdev_draid.h \
105106
sys/vdev_file.h \

include/os/linux/kernel/linux/mod_compat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ typedef const struct kernel_param zfs_kernel_param_t;
3838

3939
enum scope_prefix_types {
4040
zfs,
41+
zfs_anyraid,
4142
zfs_arc,
4243
zfs_brt,
4344
zfs_condense,

0 commit comments

Comments
 (0)