Skip to content

Commit

Permalink
Merge tag 'reconstruct-defer-work-6.8_2023-12-06' of https://git.kern…
Browse files Browse the repository at this point in the history
…el.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.8-mergeA

xfs: log intent item recovery should reconstruct defer work state

Long Li reported a KASAN report from a UAF when intent recovery fails:

 ==================================================================
 BUG: KASAN: slab-use-after-free in xfs_cui_release+0xb7/0xc0
 Read of size 4 at addr ffff888012575e60 by task kworker/u8:3/103
 CPU: 3 PID: 103 Comm: kworker/u8:3 Not tainted 6.4.0-rc7-next-20230619-00003-g94543a53f9a4-dirty torvalds#166
 Workqueue: xfs-cil/sda xlog_cil_push_work
 Call Trace:
  <TASK>
  dump_stack_lvl+0x50/0x70
  print_report+0xc2/0x600
  kasan_report+0xb6/0xe0
  xfs_cui_release+0xb7/0xc0
  xfs_cud_item_release+0x3c/0x90
  xfs_trans_committed_bulk+0x2d5/0x7f0
  xlog_cil_committed+0xaba/0xf20
  xlog_cil_push_work+0x1a60/0x2360
  process_one_work+0x78e/0x1140
  worker_thread+0x58b/0xf60
  kthread+0x2cd/0x3c0
  ret_from_fork+0x1f/0x30
  </TASK>

 Allocated by task 531:
  kasan_save_stack+0x22/0x40
  kasan_set_track+0x25/0x30
  __kasan_slab_alloc+0x55/0x60
  kmem_cache_alloc+0x195/0x5f0
  xfs_cui_init+0x198/0x1d0
  xlog_recover_cui_commit_pass2+0x133/0x5f0
  xlog_recover_items_pass2+0x107/0x230
  xlog_recover_commit_trans+0x3e7/0x9c0
  xlog_recovery_process_trans+0x140/0x1d0
  xlog_recover_process_ophdr+0x1a0/0x3d0
  xlog_recover_process_data+0x108/0x2d0
  xlog_recover_process+0x1f6/0x280
  xlog_do_recovery_pass+0x609/0xdb0
  xlog_do_log_recovery+0x84/0xe0
  xlog_do_recover+0x7d/0x470
  xlog_recover+0x25f/0x490
  xfs_log_mount+0x2dd/0x6f0
  xfs_mountfs+0x11ce/0x1e70
  xfs_fs_fill_super+0x10ec/0x1b20
  get_tree_bdev+0x3c8/0x730
  vfs_get_tree+0x89/0x2c0
  path_mount+0xecf/0x1800
  do_mount+0xf3/0x110
  __x64_sys_mount+0x154/0x1f0
  do_syscall_64+0x39/0x80
  entry_SYSCALL_64_after_hwframe+0x63/0xcd

 Freed by task 531:
  kasan_save_stack+0x22/0x40
  kasan_set_track+0x25/0x30
  kasan_save_free_info+0x2b/0x40
  __kasan_slab_free+0x114/0x1b0
  kmem_cache_free+0xf8/0x510
  xfs_cui_item_free+0x95/0xb0
  xfs_cui_release+0x86/0xc0
  xlog_recover_cancel_intents.isra.0+0xf8/0x210
  xlog_recover_finish+0x7e7/0x980
  xfs_log_mount_finish+0x2bb/0x4a0
  xfs_mountfs+0x14bf/0x1e70
  xfs_fs_fill_super+0x10ec/0x1b20
  get_tree_bdev+0x3c8/0x730
  vfs_get_tree+0x89/0x2c0
  path_mount+0xecf/0x1800
  do_mount+0xf3/0x110
  __x64_sys_mount+0x154/0x1f0
  do_syscall_64+0x39/0x80
  entry_SYSCALL_64_after_hwframe+0x63/0xcd

 The buggy address belongs to the object at ffff888012575dc8
  which belongs to the cache xfs_cui_item of size 432
 The buggy address is located 152 bytes inside of
  freed 432-byte region [ffff888012575dc8, ffff888012575f78)

 The buggy address belongs to the physical page:
 page:ffffea0000495d00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888012576208 pfn:0x12574
 head:ffffea0000495d00 order:2 entire_mapcount:0 nr_pages_mapped:0 pincount:0
 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
 page_type: 0xffffffff()
 raw: 001fffff80010200 ffff888012092f40 ffff888014570150 ffff888014570150
 raw: ffff888012576208 00000000001e0010 00000001ffffffff 0000000000000000
 page dumped because: kasan: bad access detected

 Memory state around the buggy address:
  ffff888012575d00: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
  ffff888012575d80: fc fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb
 >ffff888012575e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                                        ^
  ffff888012575e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff888012575f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc
 ==================================================================

"If process intents fails, intent items left in AIL will be delete
from AIL and freed in error handling, even intent items that have been
recovered and created done items. After this, uaf will be triggered when
done item committed, because at this point the released intent item will
be accessed.

xlog_recover_finish                     xlog_cil_push_work
----------------------------            ---------------------------
xlog_recover_process_intents
  xfs_cui_item_recover//cui_refcount == 1
    xfs_trans_get_cud
    xfs_trans_commit
      <add cud item to cil>
  xfs_cui_item_recover
    <error occurred and return>
xlog_recover_cancel_intents
  xfs_cui_release     //cui_refcount == 0
    xfs_cui_item_free //free cui
  <release other intent items>
xlog_force_shutdown   //shutdown
                               <...>
                                        <push items in cil>
                                        xlog_cil_committed
                                          xfs_cud_item_release
                                            xfs_cui_release // UAF

"Intent log items are created with a reference count of 2, one for the
creator, and one for the intent done object. Log recovery explicitly
drops the creator reference after it is inserted into the AIL, but it
then processes the log item as if it also owns the intent-done reference.

"The code in ->iop_recovery should assume that it passes the reference
to the done intent, we can remove the intent item from the AIL after
creating the done-intent, but if that code fails before creating the
done-intent then it needs to release the intent reference by log recovery
itself.

"That way when we go to cancel the intent, the only intents we find in
the AIL are the ones we know have not been processed yet and hence we
can safely drop both the creator and the intent done reference from
xlog_recover_cancel_intents().

"Hence if we remove the intent from the list of intents that need to
be recovered after we have done the initial recovery, we acheive two
things:

"1. the tail of the log can be moved forward with the commit of the
done intent or new intent to continue the operation, and

"2. We avoid the problem of trying to determine how many reference
counts we need to drop from intent recovery cancelling because we
never come across intents we've actually attempted recovery on."

Restated: The cause of the UAF is that xlog_recover_cancel_intents
thinks that it owns the refcount on any intent item in the AIL, and that
it's always safe to release these intent items.  This is not true after
the recovery function creates an log intent done item and points it at
the log intent item because releasing the done item always releases the
intent item.

The runtime defer ops code avoids all this by tracking both the log
intent and the intent done items, and releasing only the intent done
item if both have been created.  Long Li proposed fixing this by adding
state flags, but I have a more comprehensive fix.

First, observe that the latter half of the intent _recover functions are
nearly open-coded versions of the corresponding _finish_one function
that uses an onstack deferred work item to single-step through the item.

Second, notice that the recover function is not an exact match because
of the odd behavior that unfinished recovered work items are relogged
with separate log intent items instead of a single new log intent item,
which is what the defer ops machinery does.

Dave and I have long suspected that recovery should be reconstructing
the defer work state from what's in the recovered intent item.  Now we
finally have an excuse to refactor the code to do that.

This series starts by fixing a resource leak in LARP recovery.  We fix
the bug that Long Li reported by switching the intent recovery code to
construct chains of xfs_defer_pending objects and then using the defer
pending objects to track the intent/done item ownership.  Finally, we
clean up the code to reconstruct the exact incore state, which means we
can remove all the opencoded _recover code, which makes maintaining log
items much easier.

v2: minor changes per review comments
v3: pick up more rvb tags, fix build errors

This has been lightly tested with fstests.  Enjoy!

Signed-off-by: Darrick J. Wong <[email protected]>
Signed-off-by: Chandan Babu R <[email protected]>

* tag 'reconstruct-defer-work-6.8_2023-12-06' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: move ->iop_recover to xfs_defer_op_type
  xfs: use xfs_defer_finish_one to finish recovered work items
  xfs: dump the recovered xattri log item if corruption happens
  xfs: recreate work items when recovering intent items
  xfs: transfer recovered intent item ownership in ->iop_recover
  xfs: pass the xfs_defer_pending object to iop_recover
  xfs: use xfs_defer_pending objects to recover intent items
  xfs: don't leak recovered attri intent items
  • Loading branch information
Chandan Babu R committed Dec 7, 2023
2 parents 33cc938 + db7ccc0 commit 6b4ffe9
Show file tree
Hide file tree
Showing 12 changed files with 493 additions and 455 deletions.
127 changes: 96 additions & 31 deletions fs/xfs/libxfs/xfs_defer.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,23 +245,53 @@ xfs_defer_create_intents(
return ret;
}

STATIC void
static inline void
xfs_defer_pending_abort(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp)
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];

trace_xfs_defer_pending_abort(mp, dfp);

if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
}
}

static inline void
xfs_defer_pending_cancel_work(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp)
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
struct list_head *pwi;
struct list_head *n;

trace_xfs_defer_cancel_list(mp, dfp);

list_del(&dfp->dfp_list);
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
trace_xfs_defer_cancel_item(mp, dfp, pwi);
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_cache_free(xfs_defer_pending_cache, dfp);
}

STATIC void
xfs_defer_pending_abort_list(
struct xfs_mount *mp,
struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
const struct xfs_defer_op_type *ops;

/* Abort intent items that don't have a done item. */
list_for_each_entry(dfp, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_pending_abort(mp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
}
}
list_for_each_entry(dfp, dop_list, dfp_list)
xfs_defer_pending_abort(mp, dfp);
}

/* Abort all the intents that were committed. */
Expand All @@ -271,7 +301,7 @@ xfs_defer_trans_abort(
struct list_head *dop_pending)
{
trace_xfs_defer_trans_abort(tp, _RET_IP_);
xfs_defer_pending_abort(tp->t_mountp, dop_pending);
xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
}

/*
Expand Down Expand Up @@ -389,27 +419,13 @@ xfs_defer_cancel_list(
{
struct xfs_defer_pending *dfp;
struct xfs_defer_pending *pli;
struct list_head *pwi;
struct list_head *n;
const struct xfs_defer_op_type *ops;

/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_cancel_list(mp, dfp);
list_del(&dfp->dfp_list);
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
trace_xfs_defer_cancel_item(mp, dfp, pwi);
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_cache_free(xfs_defer_pending_cache, dfp);
}
list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
xfs_defer_pending_cancel_work(mp, dfp);
}

/*
Expand Down Expand Up @@ -468,7 +484,7 @@ xfs_defer_relog(
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
static int
int
xfs_defer_finish_one(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp)
Expand Down Expand Up @@ -660,9 +676,58 @@ xfs_defer_add(
list_add_tail(&dfp->dfp_list, &tp->t_dfops);
}

list_add_tail(li, &dfp->dfp_work);
xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
dfp->dfp_count++;
}

/*
* Create a pending deferred work item to replay the recovered intent item
* and add it to the list.
*/
void
xfs_defer_start_recovery(
struct xfs_log_item *lip,
enum xfs_defer_ops_type dfp_type,
struct list_head *r_dfops)
{
struct xfs_defer_pending *dfp;

dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
GFP_NOFS | __GFP_NOFAIL);
dfp->dfp_type = dfp_type;
dfp->dfp_intent = lip;
INIT_LIST_HEAD(&dfp->dfp_work);
list_add_tail(&dfp->dfp_list, r_dfops);
}

/*
* Cancel a deferred work item created to recover a log intent item. @dfp
* will be freed after this function returns.
*/
void
xfs_defer_cancel_recovery(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp)
{
xfs_defer_pending_abort(mp, dfp);
xfs_defer_pending_cancel_work(mp, dfp);
}

/* Replay the deferred work item created from a recovered log intent item. */
int
xfs_defer_finish_recovery(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
int error;

error = ops->recover_work(dfp, capture_list);
if (error)
trace_xlog_intent_recovery_failed(mp, error,
ops->recover_work);
return error;
}

/*
Expand Down Expand Up @@ -769,7 +834,7 @@ xfs_defer_ops_capture_abort(
{
unsigned short i;

xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);

for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
Expand Down
19 changes: 19 additions & 0 deletions fs/xfs/libxfs/xfs_defer.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
struct list_head *h);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
int xfs_defer_finish(struct xfs_trans **tp);
int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
void xfs_defer_cancel(struct xfs_trans *);
void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);

Expand All @@ -56,6 +57,8 @@ struct xfs_defer_op_type {
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
void (*cancel_item)(struct list_head *item);
int (*recover_work)(struct xfs_defer_pending *dfp,
struct list_head *capture_list);
unsigned int max_items;
};

Expand Down Expand Up @@ -125,6 +128,22 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);

void xfs_defer_start_recovery(struct xfs_log_item *lip,
enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops);
void xfs_defer_cancel_recovery(struct xfs_mount *mp,
struct xfs_defer_pending *dfp);
int xfs_defer_finish_recovery(struct xfs_mount *mp,
struct xfs_defer_pending *dfp, struct list_head *capture_list);

static inline void
xfs_defer_add_item(
struct xfs_defer_pending *dfp,
struct list_head *work)
{
list_add_tail(work, &dfp->dfp_work);
dfp->dfp_count++;
}

int __init xfs_defer_init_item_caches(void);
void xfs_defer_destroy_item_caches(void);

Expand Down
7 changes: 7 additions & 0 deletions fs/xfs/libxfs/xfs_log_recover.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,11 @@ xlog_recover_resv(const struct xfs_trans_res *r)
return ret;
}

struct xfs_defer_pending;

void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
xfs_lsn_t lsn, unsigned int dfp_type);
int xlog_recover_finish_intent(struct xfs_trans *tp,
struct xfs_defer_pending *dfp);

#endif /* __XFS_LOG_RECOVER_H__ */
Loading

0 comments on commit 6b4ffe9

Please sign in to comment.