Skip to content

Commit

Permalink
arc: avoid possible deadlock in arc_read
Browse files Browse the repository at this point in the history
In l2arc_evict(), the config lock may be acquired in reverse order
(e.g., first the config lock (writer), then a hash lock) unlike in
arc_read() during scenarios like L2ARC device removal. To avoid
deadlocks, if the attempt to acquire the config lock (reader) fails
in arc_read(), release the hash lock, wait for the config lock, and
retry from the beginning.

Signed-off-by: Ameer Hamza <[email protected]>
  • Loading branch information
ixhamza committed Feb 20, 2025
1 parent 6a2f7b3 commit 28f9c2a
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 16 deletions.
4 changes: 2 additions & 2 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -9043,7 +9043,7 @@ zdb_read_block(char *thing, spa_t *spa)
const blkptr_t *b = (const blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset);
if (zfs_blkptr_verify(spa, b,
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) {
abd_return_buf_copy(pabd, buf, lsize);
borrowed = B_FALSE;
buf = lbuf;
Expand All @@ -9052,7 +9052,7 @@ zdb_read_block(char *thing, spa_t *spa)
b = (const blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset);
if (lsize == -1 || zfs_blkptr_verify(spa, b,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
printf("invalid block pointer at this DVA\n");
goto out;
}
Expand Down
3 changes: 2 additions & 1 deletion include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ enum blk_verify_flag {
enum blk_config_flag {
BLK_CONFIG_HELD, // SCL_VDEV held for writer
BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader
BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV
};

Expand Down Expand Up @@ -663,7 +664,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa);

extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);

/*
Expand Down
26 changes: 22 additions & 4 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -5683,6 +5683,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
arc_buf_t *buf = NULL;
int rc = 0;
boolean_t bp_validation = B_FALSE;

ASSERT(!embedded_bp ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
Expand Down Expand Up @@ -5725,7 +5726,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
* should always be the case since the blkptr is protected by
* a checksum.
*/
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
BLK_VERIFY_LOG)) {
mutex_exit(hash_lock);
rc = SET_ERROR(ECKSUM);
Expand Down Expand Up @@ -5877,6 +5878,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
abd_t *hdr_abd;
int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
int config_lock;
int error;

if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
if (hash_lock != NULL)
Expand All @@ -5885,16 +5888,31 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
goto done;
}

if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
config_lock = BLK_CONFIG_HELD;
} else if (hash_lock != NULL) {
/*
* Prevent lock order reversal
*/
config_lock = BLK_CONFIG_NEEDED_TRY;
} else {
config_lock = BLK_CONFIG_NEEDED;
}

/*
* Verify the block pointer contents are reasonable. This
* should always be the case since the blkptr is protected by
* a checksum.
*/
if (!zfs_blkptr_verify(spa, bp,
(zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
config_lock, BLK_VERIFY_LOG))) {
if (hash_lock != NULL)
mutex_exit(hash_lock);
if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
bp_validation = B_TRUE;
goto top;
}
rc = SET_ERROR(ECKSUM);
goto done;
}
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -2305,7 +2305,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
DMU_USERUSED_OBJECT, tx);
}
arc_buf_destroy(buf, &buf);
} else if (!zfs_blkptr_verify(spa, bp,
} else if (zfs_blkptr_verify(spa, bp,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
/*
* Sanity check the block pointer contents, this is handled
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -2778,7 +2778,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
* When damaged consider it to be a metadata error since we cannot
* trust the BP_GET_TYPE and BP_GET_LEVEL values.
*/
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
atomic_inc_64(&sle->sle_meta_count);
return (0);
}
Expand Down
19 changes: 12 additions & 7 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
* it only contains known object types, checksum/compression identifiers,
* block sizes within the maximum allowed limits, valid DVAs, etc.
*
* If everything checks out B_TRUE is returned. The zfs_blkptr_verify
* If everything checks out 0 is returned. The zfs_blkptr_verify
* argument controls the behavior when an invalid field is detected.
*
* Values for blk_verify_flag:
Expand All @@ -1179,7 +1179,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
* BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
* performance
*/
boolean_t
int
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
{
Expand Down Expand Up @@ -1211,7 +1211,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
"blkptr at %px has invalid PSIZE %llu",
bp, (longlong_t)BPE_GET_PSIZE(bp));
}
return (errors == 0);
return (errors ? ECKSUM : 0);
}
if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
Expand All @@ -1229,7 +1229,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
* will be done once the zio is executed in vdev_mirror_map_alloc.
*/
if (unlikely(!spa->spa_trust_config))
return (errors == 0);
return (errors ? ECKSUM : 0);

switch (blk_config) {
case BLK_CONFIG_HELD:
Expand All @@ -1238,8 +1238,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
case BLK_CONFIG_NEEDED:
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
break;
case BLK_CONFIG_NEEDED_TRY:
if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
return (EBUSY);
break;
case BLK_CONFIG_SKIP:
return (errors == 0);
return (errors ? ECKSUM : 0);
default:
panic("invalid blk_config %u", blk_config);
}
Expand Down Expand Up @@ -1294,10 +1298,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
bp, i, (longlong_t)offset);
}
}
if (blk_config == BLK_CONFIG_NEEDED)
if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
BLK_CONFIG_NEEDED_TRY)
spa_config_exit(spa, SCL_VDEV, bp);

return (errors == 0);
return (errors ? ECKSUM : 0);
}

boolean_t
Expand Down

0 comments on commit 28f9c2a

Please sign in to comment.