Skip to content

Commit 86b5f4c

Browse files
authored
Livelist logic should handle dedup blkptrs
Update the logic to handle the dedup-case of consecutive FREEs in the livelist code. The logic still ensures that all the FREE entries are matched up with a respective ALLOC by keeping a refcount for each FREE blkptr that we encounter and ensuring that this refcount gets to zero by the time we are done processing the livelist. zdb -y no longer panics when encountering double frees Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: John Kennedy <[email protected]> Reviewed-by: Don Brady <[email protected]> Signed-off-by: Serapheim Dimitropoulos <[email protected]> Closes #11480 Closes #12177
1 parent ea40012 commit 86b5f4c

File tree

4 files changed

+210
-64
lines changed

4 files changed

+210
-64
lines changed

cmd/zdb/zdb.c

Lines changed: 72 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
163163
dmu_tx_t *tx);
164164

165165
typedef struct sublivelist_verify {
166-
/* all ALLOC'd blkptr_t in one sub-livelist */
167-
zfs_btree_t sv_all_allocs;
168-
169-
/* all FREE'd blkptr_t in one sub-livelist */
170-
zfs_btree_t sv_all_frees;
171-
172166
/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
173167
zfs_btree_t sv_pair;
174168

@@ -227,29 +221,68 @@ typedef struct sublivelist_verify_block {
227221

228222
static void zdb_print_blkptr(const blkptr_t *bp, int flags);
229223

224+
typedef struct sublivelist_verify_block_refcnt {
225+
/* block pointer entry in livelist being verified */
226+
blkptr_t svbr_blk;
227+
228+
/*
229+
* Refcount gets incremented to 1 when we encounter the first
230+
* FREE entry for the svfbr block pointer and a node for it
231+
* is created in our ZDB verification/tracking metadata.
232+
*
233+
* As we encounter more FREE entries we increment this counter
234+
* and similarly decrement it whenever we find the respective
235+
* ALLOC entries for this block.
236+
*
237+
* When the refcount gets to 0 it means that all the FREE and
238+
* ALLOC entries of this block have paired up and we no longer
239+
* need to track it in our verification logic (e.g. the node
240+
* containing this struct in our verification data structure
241+
* should be freed).
242+
*
243+
* [refer to sublivelist_verify_blkptr() for the actual code]
244+
*/
245+
uint32_t svbr_refcnt;
246+
} sublivelist_verify_block_refcnt_t;
247+
248+
static int
249+
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
250+
{
251+
const sublivelist_verify_block_refcnt_t *l = larg;
252+
const sublivelist_verify_block_refcnt_t *r = rarg;
253+
return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
254+
}
255+
230256
static int
231257
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
232258
dmu_tx_t *tx)
233259
{
234260
ASSERT3P(tx, ==, NULL);
235261
struct sublivelist_verify *sv = arg;
236-
char blkbuf[BP_SPRINTF_LEN];
262+
sublivelist_verify_block_refcnt_t current = {
263+
.svbr_blk = *bp,
264+
265+
/*
266+
* Start with 1 in case this is the first free entry.
267+
* This field is not used for our B-Tree comparisons
268+
* anyway.
269+
*/
270+
.svbr_refcnt = 1,
271+
};
272+
237273
zfs_btree_index_t where;
274+
sublivelist_verify_block_refcnt_t *pair =
275+
zfs_btree_find(&sv->sv_pair, &current, &where);
238276
if (free) {
239-
zfs_btree_add(&sv->sv_pair, bp);
240-
/* Check if the FREE is a duplicate */
241-
if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
242-
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
243-
free);
244-
(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
277+
if (pair == NULL) {
278+
/* first free entry for this block pointer */
279+
zfs_btree_add(&sv->sv_pair, &current);
245280
} else {
246-
zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
281+
pair->svbr_refcnt++;
247282
}
248283
} else {
249-
/* Check if the ALLOC has been freed */
250-
if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
251-
zfs_btree_remove_idx(&sv->sv_pair, &where);
252-
} else {
284+
if (pair == NULL) {
285+
/* block that is currently marked as allocated */
253286
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
254287
if (DVA_IS_EMPTY(&bp->blk_dva[i]))
255288
break;
@@ -264,49 +297,39 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
264297
&svb, &where);
265298
}
266299
}
267-
}
268-
/* Check if the ALLOC is a duplicate */
269-
if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
270-
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
271-
free);
272-
(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
273300
} else {
274-
zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
301+
/* alloc matches a free entry */
302+
pair->svbr_refcnt--;
303+
if (pair->svbr_refcnt == 0) {
304+
/* all allocs and frees have been matched */
305+
zfs_btree_remove_idx(&sv->sv_pair, &where);
306+
}
275307
}
276308
}
309+
277310
return (0);
278311
}
279312

280313
static int
281314
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
282315
{
283316
int err;
284-
char blkbuf[BP_SPRINTF_LEN];
285317
struct sublivelist_verify *sv = args;
286318

287-
zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
288-
sizeof (blkptr_t));
289-
290-
zfs_btree_create(&sv->sv_all_frees, livelist_compare,
291-
sizeof (blkptr_t));
292-
293-
zfs_btree_create(&sv->sv_pair, livelist_compare,
294-
sizeof (blkptr_t));
319+
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
320+
sizeof (sublivelist_verify_block_refcnt_t));
295321

296322
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
297323
sv, NULL);
298324

299-
zfs_btree_clear(&sv->sv_all_allocs);
300-
zfs_btree_destroy(&sv->sv_all_allocs);
301-
302-
zfs_btree_clear(&sv->sv_all_frees);
303-
zfs_btree_destroy(&sv->sv_all_frees);
304-
305-
blkptr_t *e;
325+
sublivelist_verify_block_refcnt_t *e;
306326
zfs_btree_index_t *cookie = NULL;
307327
while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
308-
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
309-
(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
328+
char blkbuf[BP_SPRINTF_LEN];
329+
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
330+
&e->svbr_blk, B_TRUE);
331+
(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
332+
e->svbr_refcnt, blkbuf);
310333
}
311334
zfs_btree_destroy(&sv->sv_pair);
312335

@@ -615,10 +638,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
615638
/*
616639
* [Livelist Check]
617640
* Iterate through all the sublivelists and:
618-
* - report leftover frees
619-
* - report double ALLOCs/FREEs
641+
* - report leftover frees (**)
620642
* - record leftover ALLOCs together with their TXG [see Cross Check]
621643
*
644+
* (**) Note: Double ALLOCs are valid in datasets that have dedup
645+
* enabled. Similarly double FREEs are allowed as well but
646+
* only if they pair up with a corresponding ALLOC entry once
647+
* we our done with our sublivelist iteration.
648+
*
622649
* [Spacemap Check]
623650
* for each metaslab:
624651
* - iterate over spacemap and then the metaslab's entries in the

module/zfs/dsl_deadlist.c

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
909909
}
910910

911911
typedef struct livelist_entry {
912-
const blkptr_t *le_bp;
912+
blkptr_t le_bp;
913+
uint32_t le_refcnt;
913914
avl_node_t le_node;
914915
} livelist_entry_t;
915916

916917
static int
917918
livelist_compare(const void *larg, const void *rarg)
918919
{
919-
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
920-
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
920+
const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
921+
const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;
921922

922923
/* Sort them according to dva[0] */
923924
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
@@ -944,6 +945,11 @@ struct livelist_iter_arg {
944945
* Expects an AVL tree which is incrementally filled will FREE blkptrs
945946
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
946947
* corresponding FREE are stored in the supplied bplist.
948+
*
949+
* Note that multiple FREE and ALLOC entries for the same blkptr may
950+
* be encountered when dedup is involved. For this reason we keep a
951+
* refcount for all the FREE entries of each blkptr and ensure that
952+
* each of those FREE entries has a corresponding ALLOC preceding it.
947953
*/
948954
static int
949955
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
@@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
957963

958964
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
959965
return (SET_ERROR(EINTR));
966+
967+
livelist_entry_t node;
968+
node.le_bp = *bp;
969+
livelist_entry_t *found = avl_find(avl, &node, NULL);
960970
if (bp_freed) {
961-
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
962-
KM_SLEEP);
963-
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
964-
*temp_bp = *bp;
965-
node->le_bp = temp_bp;
966-
avl_add(avl, node);
967-
} else {
968-
livelist_entry_t node;
969-
node.le_bp = bp;
970-
livelist_entry_t *found = avl_find(avl, &node, NULL);
971-
if (found != NULL) {
972-
avl_remove(avl, found);
973-
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
974-
kmem_free(found, sizeof (livelist_entry_t));
971+
if (found == NULL) {
972+
/* first free entry for this blkptr */
973+
livelist_entry_t *e =
974+
kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
975+
e->le_bp = *bp;
976+
e->le_refcnt = 1;
977+
avl_add(avl, e);
975978
} else {
979+
/* dedup block free */
980+
ASSERT(BP_GET_DEDUP(bp));
981+
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
982+
BP_GET_CHECKSUM(&found->le_bp));
983+
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
984+
found->le_refcnt++;
985+
}
986+
} else {
987+
if (found == NULL) {
988+
/* block is currently marked as allocated */
976989
bplist_append(to_free, bp);
990+
} else {
991+
/* alloc matches a free entry */
992+
ASSERT3U(found->le_refcnt, !=, 0);
993+
found->le_refcnt--;
994+
if (found->le_refcnt == 0) {
995+
/* all tracked free pairs have been matched */
996+
avl_remove(avl, found);
997+
kmem_free(found, sizeof (livelist_entry_t));
998+
} else {
999+
/*
1000+
* This is definitely a deduped blkptr so
1001+
* let's validate it.
1002+
*/
1003+
ASSERT(BP_GET_DEDUP(bp));
1004+
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
1005+
BP_GET_CHECKSUM(&found->le_bp));
1006+
}
9771007
}
9781008
}
9791009
return (0);
@@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
9991029
};
10001030
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
10011031

1032+
VERIFY0(avl_numnodes(&avl));
10021033
avl_destroy(&avl);
10031034
return (err);
10041035
}

tests/runfiles/common.run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,8 @@ tags = ['functional', 'cli_root', 'zfs_create']
166166

167167
[tests/functional/cli_root/zfs_destroy]
168168
tests = ['zfs_clone_livelist_condense_and_disable',
169-
'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
170-
'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
169+
'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
170+
'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
171171
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
172172
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
173173
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/bin/ksh -p
2+
#
3+
# This file and its contents are supplied under the terms of the
4+
# Common Development and Distribution License ("CDDL"), version 1.0.
5+
# You may only use this file in accordance with the terms of version
6+
# 1.0 of the CDDL.
7+
#
8+
# A full copy of the text of the CDDL should have accompanied this
9+
# source. A copy of the CDDL is also available via the Internet at
10+
# http://www.illumos.org/license/CDDL.
11+
#
12+
13+
#
14+
# Copyright (c) 2021 by Delphix. All rights reserved.
15+
#
16+
17+
# DESCRIPTION
18+
# Verify zfs destroy test for clones with livelists that contain
19+
# dedup blocks. This test is a baseline regression test created
20+
# to ensure that past bugs that we've encountered between dedup
21+
# and the livelist logic don't resurface.
22+
23+
# STRATEGY
24+
# 1. Create a clone from a test filesystem and enable dedup.
25+
# 2. Write some data and create a livelist.
26+
# 3. Copy the data within the clone to create dedup blocks.
27+
# 4. Remove some of the dedup data to create multiple free
28+
# entries for the same block pointers.
29+
# 5. Process all the livelist entries by destroying the clone.
30+
31+
. $STF_SUITE/include/libtest.shlib
32+
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
33+
34+
function cleanup
35+
{
36+
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
37+
# Reset the minimum percent shared to 75
38+
set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED
39+
}
40+
41+
function test_dedup
42+
{
43+
# Set a small percent shared threshold so the livelist is not disabled
44+
set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10
45+
clone_dataset $TESTFS1 snap $TESTCLONE
46+
47+
# Enable dedup
48+
log_must zfs set dedup=on $TESTPOOL/$TESTCLONE
49+
50+
# Create some data to be deduped
51+
log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k
52+
53+
# Create dedup blocks
54+
# Note: We sync before and after so all dedup blocks belong to the
55+
# same TXG, otherwise they won't look identical to the livelist
56+
# iterator due to their logical birth TXG being different.
57+
log_must zpool sync $TESTPOOL
58+
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0
59+
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1
60+
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2
61+
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3
62+
log_must zpool sync $TESTPOOL
63+
check_livelist_exists $TESTCLONE
64+
65+
# Introduce "double frees"
66+
# We want to introduce consecutive FREEs of the same block as this
67+
# was what triggered past panics.
68+
# Note: Similarly to the previouys step we sync before and after our
69+
# our deletions so all the entries end up in the same TXG.
70+
log_must zpool sync $TESTPOOL
71+
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2
72+
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3
73+
log_must zpool sync $TESTPOOL
74+
check_livelist_exists $TESTCLONE
75+
76+
log_must zfs destroy $TESTPOOL/$TESTCLONE
77+
check_livelist_gone
78+
}
79+
80+
ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED)
81+
82+
log_onexit cleanup
83+
log_must zfs create $TESTPOOL/$TESTFS1
84+
log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
85+
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
86+
test_dedup
87+
88+
log_pass "Clone's livelist processes dedup blocks as expected."

0 commit comments

Comments
 (0)