Skip to content

Commit 8898f4e

Browse files
committed
range_tree: Add zfs_recover_rt parameter and extra debug info
There are production cases where unexpected range tree segment addition/removal leads to panic. The root cause investigation requires more debug info about the range tree and the segments in question when it happens. In addition, the zfs_recover_rt parameter allows converting such panics into warnings, with a potential space leak as a trade-off. Signed-off-by: Igor Ostapenko <[email protected]>
1 parent fe67499 commit 8898f4e

File tree

9 files changed

+263
-71
lines changed

9 files changed

+263
-71
lines changed

include/sys/range_tree.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,31 @@ typedef enum zfs_range_seg_type {
4848
ZFS_RANGE_SEG_NUM_TYPES,
4949
} zfs_range_seg_type_t;
5050

51+
/*
52+
* Use case flags to support the zfs_recover_rt parameter.
53+
*
54+
* The range tree's logic needs to know the context in order to correctly
55+
* recover from an unexpected situation by exchanging potential data loss for
56+
* a potential space leak:
57+
*
58+
* - If it knows that the tree represents allocated space then it should
59+
* perform an unexpected addition to the tree.
60+
*
61+
* - Similarly, if it's about free space (aka allocatable) then it should
62+
* perform unexpected removals instead of silently ignoring the issue.
63+
*
64+
* An unknown case means to ignore unexpected additions/removals as a recovery
65+
* measure.
66+
*
67+
* In any case, unexpected actions are logged with extra details such as
68+
* a range tree instance string, which can be marked as dynamic to be freed
69+
* along with the tree instance destruction.
70+
*/
71+
#define ZFS_RANGE_TREE_UC_UNKNOWN (1 << 0)
72+
#define ZFS_RANGE_TREE_UC_ALLOCATED_SPACE (1 << 1)
73+
#define ZFS_RANGE_TREE_UC_FREE_SPACE (1 << 2)
74+
#define ZFS_RANGE_TREE_UC_DYN_INSTANCE (1 << 16)
75+
5176
/*
5277
* Note: the range_tree may not be accessed concurrently; consumers
5378
* must provide external locking if required.
@@ -67,6 +92,10 @@ typedef struct zfs_range_tree {
6792
void *rt_arg;
6893
uint64_t rt_gap; /* allowable inter-segment gap */
6994

95+
/* zfs_recover_rt support */
96+
uint64_t rt_use_case_flags;
97+
const char *rt_instance; /* details for debugging */
98+
7099
/*
71100
* The rt_histogram maintains a histogram of ranges. Each bucket,
72101
* rt_histogram[i], contains the number of ranges whose size is:
@@ -280,6 +309,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
280309
uint64_t gap);
281310
zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
282311
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
312+
zfs_range_tree_t *zfs_range_tree_create_usecase(const zfs_range_tree_ops_t *ops,
313+
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
314+
uint64_t use_case_flags, const char *instance);
283315
void zfs_range_tree_destroy(zfs_range_tree_t *rt);
284316
boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
285317
uint64_t size);

man/man4/zfs.4

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1987,6 +1987,12 @@ Set to attempt to recover from fatal errors.
19871987
This should only be used as a last resort,
19881988
as it typically results in leaked space, or worse.
19891989
.
1990+
.It Sy zfs_recover_rt Ns = Ns Sy 0 Ns | Ns 1 Pq int
1991+
Set to attempt to recover from fatal errors while adding or removing
1992+
unexpected segments to or from a range tree.
1993+
This should only be used as a last resort,
1994+
as it typically results in leaked space.
1995+
.
19901996
.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
19911997
Ignore hard I/O errors during device removal.
19921998
When set, if a device encounters a hard I/O error during the removal process

module/zfs/dnode.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,8 +2435,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
24352435
{
24362436
int txgoff = tx->tx_txg & TXG_MASK;
24372437
if (dn->dn_free_ranges[txgoff] == NULL) {
2438-
dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
2439-
ZFS_RANGE_SEG64, NULL, 0, 0);
2438+
dn->dn_free_ranges[txgoff] =
2439+
zfs_range_tree_create_usecase(
2440+
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
2441+
ZFS_RANGE_TREE_UC_FREE_SPACE, "dn_free_ranges");
24402442
}
24412443
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
24422444
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);

module/zfs/metaslab.c

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,22 @@ static metaslab_stats_t metaslab_stats = {
368368
#define METASLABSTAT_BUMP(stat) \
369369
atomic_inc_64(&metaslab_stats.stat.value.ui64);
370370

371+
static inline char *
372+
metaslab_range_tree_instance(metaslab_group_t *mg, metaslab_t *ms,
373+
const char *rt_name)
374+
{
375+
const size_t len = 4 * ZFS_MAX_DATASET_NAME_LEN;
376+
char *buf = kmem_zalloc(len, KM_SLEEP);
377+
378+
snprintf(buf, len, "{spa=%s vdev_guid=%llu ms_id=%llu %s}",
379+
mg->mg_vd->vdev_spa->spa_name,
380+
(u_longlong_t)mg->mg_vd->vdev_guid,
381+
(u_longlong_t)ms->ms_id,
382+
rt_name);
383+
384+
return (buf);
385+
}
386+
371387

372388
static kstat_t *metaslab_ksp;
373389

@@ -2753,30 +2769,53 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
27532769
zfs_range_seg_type_t type =
27542770
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
27552771

2756-
ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
2757-
shift);
2772+
ms->ms_allocatable = zfs_range_tree_create_usecase(
2773+
NULL, type, NULL, start, shift,
2774+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2775+
metaslab_range_tree_instance(mg, ms, "ms_allocatable"));
27582776
for (int t = 0; t < TXG_SIZE; t++) {
2759-
ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
2760-
NULL, start, shift);
2761-
}
2762-
ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
2763-
ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
2777+
ms->ms_allocating[t] = zfs_range_tree_create_usecase(
2778+
NULL, type, NULL, start, shift,
2779+
ZFS_RANGE_TREE_UC_ALLOCATED_SPACE |
2780+
ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2781+
metaslab_range_tree_instance(mg, ms, "ms_allocating"));
2782+
}
2783+
ms->ms_freeing = zfs_range_tree_create_usecase(
2784+
NULL, type, NULL, start, shift,
2785+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2786+
metaslab_range_tree_instance(mg, ms, "ms_freeing"));
2787+
ms->ms_freed = zfs_range_tree_create_usecase(
2788+
NULL, type, NULL, start, shift,
2789+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2790+
metaslab_range_tree_instance(mg, ms, "ms_freed"));
27642791
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2765-
ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
2766-
start, shift);
2767-
}
2768-
ms->ms_checkpointing =
2769-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2770-
ms->ms_unflushed_allocs =
2771-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2792+
ms->ms_defer[t] = zfs_range_tree_create_usecase(
2793+
NULL, type, NULL, start, shift,
2794+
ZFS_RANGE_TREE_UC_FREE_SPACE |
2795+
ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2796+
metaslab_range_tree_instance(mg, ms, "ms_defer"));
2797+
}
2798+
ms->ms_checkpointing = zfs_range_tree_create_usecase(
2799+
NULL, type, NULL, start, shift,
2800+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2801+
metaslab_range_tree_instance(mg, ms, "ms_checkpointing"));
2802+
ms->ms_unflushed_allocs = zfs_range_tree_create_usecase(
2803+
NULL, type, NULL, start, shift,
2804+
ZFS_RANGE_TREE_UC_ALLOCATED_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2805+
metaslab_range_tree_instance(mg, ms, "ms_unflushed_allocs"));
27722806

27732807
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
27742808
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
27752809
mrap->mra_floor_shift = metaslab_by_size_min_shift;
2776-
ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
2777-
type, mrap, start, shift);
2810+
ms->ms_unflushed_frees = zfs_range_tree_create_usecase(
2811+
&metaslab_rt_ops, type, mrap, start, shift,
2812+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2813+
metaslab_range_tree_instance(mg, ms, "ms_unflushed_frees"));
27782814

2779-
ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
2815+
ms->ms_trim = zfs_range_tree_create_usecase(
2816+
NULL, type, NULL, start, shift,
2817+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
2818+
metaslab_range_tree_instance(mg, ms, "ms_trim"));
27802819

27812820
metaslab_group_add(mg, ms);
27822821
metaslab_set_fragmentation(ms, B_FALSE);
@@ -3750,7 +3789,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
37503789
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
37513790
&start, &shift);
37523791

3753-
condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
3792+
condense_tree = zfs_range_tree_create_usecase(
3793+
NULL, type, NULL, start, shift,
3794+
ZFS_RANGE_TREE_UC_FREE_SPACE | ZFS_RANGE_TREE_UC_DYN_INSTANCE,
3795+
metaslab_range_tree_instance(msp->ms_group, msp, "condense_tree"));
37543796

37553797
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
37563798
zfs_range_tree_walk(msp->ms_defer[t],

0 commit comments

Comments
 (0)