
Commit ee25c11

Fix deadlock on I/O errors during device removal
spa_vdev_remove_thread() should not hold svr_lock while loading a metaslab.
It may block ZIO threads that are required to handle the metaslab loading,
at least in the case of read errors causing recovery writes.

Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Rob Norris <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes openzfs#17145
1 parent: 44144f6 · commit: ee25c11
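At its core the fix below is a try-lock-and-retry protocol: the blocking space map load is done with svr_lock dropped, svr_lock is then reacquired with mutex_tryenter(), and on contention the thread backs out of the metaslab locks, waits for svr_lock where blocking is safe, and redoes the metaslab. The following is a minimal user-space sketch of that protocol, not the kernel code itself: POSIX mutexes stand in for the ZFS mutexes, and load_segments_locally(), discard_local_segments(), and publish_segments() are hypothetical placeholders for the space map load, the zfs_range_tree_vacate() discard, and the hand-off into svr_allocd_segs.

/* trylock_retry.c — build with: cc -pthread -o trylock_retry trylock_retry.c */
#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the two lock domains in the patch: "ms_lock" for the
 * per-metaslab locks and "svr_lock" for the removal lock that the ZIO
 * completion path may also need. */
static pthread_mutex_t ms_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t svr_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical placeholders for the slow, possibly blocking work that
 * must not run while svr_lock is held, and for discarding or publishing
 * its result (the "segs" range tree in the real patch). */
static void load_segments_locally(void) { }
static void discard_local_segments(void) { }
static void publish_segments(void) { }

static void
process_one_metaslab(void)
{
        pthread_mutex_lock(&svr_lock);
again:
        /* Drop svr_lock before taking the metaslab lock and loading. */
        pthread_mutex_unlock(&svr_lock);
        pthread_mutex_lock(&ms_lock);
        load_segments_locally();

        /*
         * Reacquire svr_lock opportunistically.  Blocking on it here,
         * while ms_lock is held, is exactly the ordering the patch
         * avoids, so on contention back out and retry.
         */
        if (pthread_mutex_trylock(&svr_lock) != 0) {
                pthread_mutex_unlock(&ms_lock);
                discard_local_segments();
                pthread_mutex_lock(&svr_lock);  /* safe to block here */
                goto again;
        }

        publish_segments();                     /* held: ms_lock + svr_lock */
        pthread_mutex_unlock(&svr_lock);
        pthread_mutex_unlock(&ms_lock);
}

int
main(void)
{
        process_one_metaslab();
        puts("metaslab processed");
        return (0);
}

The retry path only blocks on svr_lock after every metaslab-side lock has been released, which is what keeps the lock ordering consistent with the other lock takers.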

File tree: 1 file changed (+39, −19 lines)

module/zfs/vdev_removal.c

Lines changed: 39 additions & 19 deletions
@@ -1621,6 +1621,9 @@ spa_vdev_remove_thread(void *arg)
         vca.vca_read_error_bytes = 0;
         vca.vca_write_error_bytes = 0;
 
+        zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
+            NULL, 0, 0);
+
         mutex_enter(&svr->svr_lock);
 
         /*
@@ -1633,7 +1636,9 @@ spa_vdev_remove_thread(void *arg)
                 metaslab_t *msp = vd->vdev_ms[msi];
                 ASSERT3U(msi, <=, vd->vdev_ms_count);
 
+again:
                 ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
+                mutex_exit(&svr->svr_lock);
 
                 mutex_enter(&msp->ms_sync_lock);
                 mutex_enter(&msp->ms_lock);
@@ -1646,36 +1651,49 @@ spa_vdev_remove_thread(void *arg)
                 }
 
                 /*
-                 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+                 * If the metaslab has ever been synced (ms_sm != NULL),
                  * read the allocated segments from the space map object
                  * into svr_allocd_segs. Since we do this while holding
-                 * svr_lock and ms_sync_lock, concurrent frees (which
+                 * ms_lock and ms_sync_lock, concurrent frees (which
                  * would have modified the space map) will wait for us
                  * to finish loading the spacemap, and then take the
                  * appropriate action (see free_from_removing_vdev()).
                  */
-                if (msp->ms_sm != NULL) {
-                        VERIFY0(space_map_load(msp->ms_sm,
-                            svr->svr_allocd_segs, SM_ALLOC));
-
-                        zfs_range_tree_walk(msp->ms_unflushed_allocs,
-                            zfs_range_tree_add, svr->svr_allocd_segs);
-                        zfs_range_tree_walk(msp->ms_unflushed_frees,
-                            zfs_range_tree_remove, svr->svr_allocd_segs);
-                        zfs_range_tree_walk(msp->ms_freeing,
-                            zfs_range_tree_remove, svr->svr_allocd_segs);
+                if (msp->ms_sm != NULL)
+                        VERIFY0(space_map_load(msp->ms_sm, segs, SM_ALLOC));
 
-                        /*
-                         * When we are resuming from a paused removal (i.e.
-                         * when importing a pool with a removal in progress),
-                         * discard any state that we have already processed.
-                         */
-                        zfs_range_tree_clear(svr->svr_allocd_segs, 0,
-                            start_offset);
+                /*
+                 * We could not hold svr_lock while loading space map, or we
+                 * could hit deadlock in a ZIO pipeline, having to wait for
+                 * it. But we can not block for it here under metaslab locks,
+                 * or it would be a lock ordering violation.
+                 */
+                if (!mutex_tryenter(&svr->svr_lock)) {
+                        mutex_exit(&msp->ms_lock);
+                        mutex_exit(&msp->ms_sync_lock);
+                        zfs_range_tree_vacate(segs, NULL, NULL);
+                        mutex_enter(&svr->svr_lock);
+                        goto again;
                 }
+
+                zfs_range_tree_swap(&segs, &svr->svr_allocd_segs);
+                zfs_range_tree_walk(msp->ms_unflushed_allocs,
+                    zfs_range_tree_add, svr->svr_allocd_segs);
+                zfs_range_tree_walk(msp->ms_unflushed_frees,
+                    zfs_range_tree_remove, svr->svr_allocd_segs);
+                zfs_range_tree_walk(msp->ms_freeing,
+                    zfs_range_tree_remove, svr->svr_allocd_segs);
+
                 mutex_exit(&msp->ms_lock);
                 mutex_exit(&msp->ms_sync_lock);
 
+                /*
+                 * When we are resuming from a paused removal (i.e.
+                 * when importing a pool with a removal in progress),
+                 * discard any state that we have already processed.
+                 */
+                zfs_range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+
                 vca.vca_msp = msp;
                 zfs_dbgmsg("copying %llu segments for metaslab %llu",
                     (u_longlong_t)zfs_btree_numnodes(
@@ -1751,6 +1769,8 @@ spa_vdev_remove_thread(void *arg)
 
         spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+        zfs_range_tree_destroy(segs);
+
         /*
          * Wait for all copies to finish before cleaning up the vca.
          */
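One consequence of loading with svr_lock dropped is that the segments land in a private segs tree and have to be handed over once the lock is back: that is what the zfs_range_tree_swap(&segs, &svr->svr_allocd_segs) call above does, swapping the trees rather than copying them segment by segment. Below is a rough user-space sketch of the same "prepare privately, publish under the lock" idea, assuming pthreads and a plain array; every name in it is hypothetical.

/* swap_publish.c — build with: cc -pthread -o swap_publish swap_publish.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared;             /* readers take "lock" to look at this */
static size_t shared_len;

/* Build the result into private storage with no lock held, then make it
 * visible with a pointer swap inside a short critical section — the
 * analogue of loading into "segs" and swapping it into svr_allocd_segs. */
static int
publish(size_t n)
{
        int *priv = malloc(n * sizeof (int));   /* slow work, unlocked */
        if (priv == NULL)
                return (-1);
        for (size_t i = 0; i < n; i++)
                priv[i] = (int)i;

        pthread_mutex_lock(&lock);
        int *old = shared;                      /* O(1) handover */
        shared = priv;
        shared_len = n;
        pthread_mutex_unlock(&lock);

        free(old);                              /* reclaim outside the lock */
        return (0);
}

int
main(void)
{
        if (publish(1000) == 0)
                printf("published %zu entries\n", shared_len);
        free(shared);
        return (0);
}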
