Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle interaction between gang blocks, copies, and FDT. #17123

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);

Expand Down
11 changes: 11 additions & 0 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,17 @@ ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
return (ddp->ddp_trad[v].ddp_phys_birth);
}

int
ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
ASSERT3U(v, <, DDT_PHYS_NONE);

const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;

return (DVA_GET_GANG(&dvas[0]));
}

int
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted)
Expand Down
75 changes: 62 additions & 13 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -3145,6 +3145,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* This value respects the redundant_metadata property.
*/
int gbh_copies = gio->io_prop.zp_gang_copies;
if (gbh_copies == 0) {
/*
* This should only happen in the case where we're filling in
* DDT entries for a parent that wants more copies than the DDT
* has. In that case, we cannot gang without creating a mixed
* blkptr, which is illegal.
*/
ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT);
pio->io_error = EAGAIN;
return (pio);
}
ASSERT3S(gbh_copies, >, 0);
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);

Expand Down Expand Up @@ -3614,16 +3625,6 @@ zio_ddt_child_write_done(zio_t *zio)
return;
}

/*
* We've successfully added new DVAs to the entry. Clear the saved
* state or, if there's still outstanding IO, remember it so we can
* revert to a known good state if that IO fails.
*/
if (dde->dde_io->dde_lead_zio[p] == NULL)
ddt_phys_clear(orig, v);
else
ddt_phys_copy(orig, ddp, v);

/*
* Add references for all dedup writes that were waiting on the
* physical one, skipping any other physical writes that are waiting.
Expand All @@ -3635,6 +3636,16 @@ zio_ddt_child_write_done(zio_t *zio)
ddt_phys_addref(ddp, v);
}

/*
* We've successfully added new DVAs to the entry. Clear the saved
* state or, if there's still outstanding IO, remember it so we can
* revert to a known good state if that IO fails.
*/
if (dde->dde_io->dde_lead_zio[p] == NULL)
ddt_phys_clear(orig, v);
else
ddt_phys_copy(orig, ddp, v);

ddt_exit(ddt);
}

Expand All @@ -3650,6 +3661,16 @@ zio_ddt_child_write_ready(zio_t *zio)
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

if (ddt_phys_is_gang(dde->dde_phys, v)) {
for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) {
dva_t *d = &zio->io_bp->blk_dva[i];
metaslab_group_alloc_decrement(zio->io_spa,
DVA_GET_VDEV(d), METASLAB_ASYNC_ALLOC,
zio->io_allocator, B_FALSE, zio);
}
zio->io_error = EAGAIN;
}

if (zio->io_error != 0)
return;

Expand Down Expand Up @@ -3769,9 +3790,12 @@ zio_ddt_write(zio_t *zio)
*/
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
boolean_t is_ganged = ddt_phys_is_gang(ddp, v);

/* Number of DVAs requested bya the IO. */
/* Number of DVAs requested by the IO. */
uint8_t need_dvas = zp->zp_copies;
/* Number of DVAs in outstanding writes for this dde. */
uint8_t parent_dvas = 0;

/*
* What we do next depends on whether or not there's IO outstanding that
Expand Down Expand Up @@ -3901,7 +3925,7 @@ zio_ddt_write(zio_t *zio)
zio_t *pio =
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
ASSERT(pio);
uint8_t parent_dvas = pio->io_prop.zp_copies;
parent_dvas = pio->io_prop.zp_copies;

if (parent_dvas >= need_dvas) {
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
Expand All @@ -3916,13 +3940,27 @@ zio_ddt_write(zio_t *zio)
need_dvas -= parent_dvas;
}

if (is_ganged) {
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}

/*
* We need to write. We will create a new write with the copies
* property adjusted to match the number of DVAs we need to need to
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = czp.zp_gang_copies = need_dvas;
if (have_dvas > 0 || parent_dvas > 0) {
czp.zp_copies = need_dvas;
czp.zp_gang_copies = 0;
} else {
ASSERT3U(czp.zp_copies, ==, need_dvas);
}

zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL,
Expand Down Expand Up @@ -4106,6 +4144,7 @@ zio_dva_allocate(zio_t *zio)
ASSERT(BP_IS_HOLE(bp));
ASSERT0(BP_GET_NDVAS(bp));
ASSERT3U(zio->io_prop.zp_copies, >, 0);

ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

Expand Down Expand Up @@ -5391,6 +5430,16 @@ zio_done(zio_t *zio)
}

if (zio->io_error && zio == zio->io_logical) {

/*
* A DDT child tried to create a mixed gang/non-gang BP. We're
* going to have to just retry as a non-dedup IO.
*/
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
zio->io_prop.zp_dedup) {
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
zio->io_prop.zp_dedup = B_FALSE;
}
/*
* Determine whether zio should be reexecuted. This will
* propagate all the way to the root via zio_notify_parent().
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
tags = ['functional', 'features', 'large_dnode']

[tests/functional/gang_blocks]
tests = ['gang_blocks_redundant']
tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
tags = ['functional', 'gang_blocks']

[tests/functional/grow]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1560,6 +1560,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/features/large_dnode/large_dnode_009_pos.ksh \
functional/features/large_dnode/setup.ksh \
functional/gang_blocks/cleanup.ksh \
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
functional/gang_blocks/gang_blocks_redundant.ksh \
functional/gang_blocks/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2025 by Klara Inc.
#

#
# Description:
# Verify that mixed gang blocks and copies interact correctly in FDT
#
# Strategy:
# 1. Store a block with copies = 1 in the DDT unganged.
# 2. Add a new entry with copies = 2 that gangs, ensure it doesn't panic
# 3. Store a block with copies = 1 in the DDT ganged.
# 4. Add a new entry with copies = 3 that doesn't gang, ensure that it doesn't panic.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib

log_assert "Verify that mixed gang blocks and copies interact correctly in FDT"

save_tunable DEDUP_LOG_TXG_MAX

function cleanup2
{
zfs destroy $TESTPOOL/fs1
zfs destroy $TESTPOOL/fs2
restore_tunable DEDUP_LOG_TXG_MAX
cleanup
}

preamble
log_onexit cleanup2

log_must zpool create -f -o ashift=9 -o feature@block_cloning=disabled $TESTPOOL $DISKS
log_must zfs create -o recordsize=64k -o dedup=on $TESTPOOL/fs1
log_must zfs create -o recordsize=64k -o dedup=on -o copies=3 $TESTPOOL/fs2
set_tunable32 DEDUP_LOG_TXG_MAX 1
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
log_must sync_pool $TESTPOOL
set_tunable32 METASLAB_FORCE_GANGING 20000
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
Comment on lines +52 to +53
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: we should technically save_tunable/restore_tunable these two params as well.

log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
log_must sync_pool $TESTPOOL

log_must rm /$TESTPOOL/fs*/f1
log_must sync_pool $TESTPOOL
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
log_must sync_pool $TESTPOOL
log_must zdb -D $TESTPOOL
set_tunable32 METASLAB_FORCE_GANGING_PCT 0
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
log_must sync_pool $TESTPOOL

log_must rm /$TESTPOOL/fs*/f1
log_must sync_pool $TESTPOOL
set_tunable32 METASLAB_FORCE_GANGING_PCT 50
set_tunable32 METASLAB_FORCE_GANGING 40000
log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=64k count=1
for i in `seq 1 16`; do
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs2/f1
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs1/f1
log_must sync_pool $TESTPOOL
log_must zdb -D $TESTPOOL
done

log_pass "Verify that mixed gang blocks and copies interact correctly in FDT"
Loading