Skip to content

Commit 17d00e8

Browse files
Moshe Shemesh, Saeed Mahameed
Moshe Shemesh
authored and
Saeed Mahameed
committed
net/mlx5: Add command entry handling completion
When the FW response to commands is very slow and all command entries in use are waiting for completion, we can have a race where commands time out before they get out of the queue and are handled. A timeout completion on an uninitialized command will cause the command's buffers to be released before they are accessed for initialization, and we will then get a NULL pointer exception while trying to access them. It may also cause the buffers of another command to be released, since we may get a timeout completion before an entry index has even been allocated for this command. Add an entry handling completion to avoid this race.

Fixes: e126ba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Moshe Shemesh <[email protected]>
Signed-off-by: Eran Ben Elisha <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 5a73015 commit 17d00e8

File tree

2 files changed

+15
-0
lines changed
  • drivers/net/ethernet/mellanox/mlx5/core
  • include/linux/mlx5

2 files changed

+15
-0
lines changed

drivers/net/ethernet/mellanox/mlx5/core/cmd.c

+14
Original file line number | Diff line number | Diff line change
@@ -861,6 +861,7 @@ static void cmd_work_handler(struct work_struct *work)
861861
int alloc_ret;
862862
int cmd_mode;
863863

864+
complete(&ent->handling);
864865
sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
865866
down(sem);
866867
if (!ent->page_queue) {
@@ -978,19 +979,29 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
978979
struct mlx5_cmd *cmd = &dev->cmd;
979980
int err;
980981

982+
if (!wait_for_completion_timeout(&ent->handling, timeout) &&
983+
cancel_work_sync(&ent->work)) {
984+
ent->ret = -ECANCELED;
985+
goto out_err;
986+
}
981987
if (cmd->mode == CMD_MODE_POLLING || ent->polling) {
982988
wait_for_completion(&ent->done);
983989
} else if (!wait_for_completion_timeout(&ent->done, timeout)) {
984990
ent->ret = -ETIMEDOUT;
985991
mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
986992
}
987993

994+
out_err:
988995
err = ent->ret;
989996

990997
if (err == -ETIMEDOUT) {
991998
mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
992999
mlx5_command_str(msg_to_opcode(ent->in)),
9931000
msg_to_opcode(ent->in));
1001+
} else if (err == -ECANCELED) {
1002+
mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
1003+
mlx5_command_str(msg_to_opcode(ent->in)),
1004+
msg_to_opcode(ent->in));
9941005
}
9951006
mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
9961007
err, deliv_status_to_str(ent->status), ent->status);
@@ -1026,6 +1037,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
10261037
ent->token = token;
10271038
ent->polling = force_polling;
10281039

1040+
init_completion(&ent->handling);
10291041
if (!callback)
10301042
init_completion(&ent->done);
10311043

@@ -1045,6 +1057,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
10451057
err = wait_func(dev, ent);
10461058
if (err == -ETIMEDOUT)
10471059
goto out;
1060+
if (err == -ECANCELED)
1061+
goto out_free;
10481062

10491063
ds = ent->ts2 - ent->ts1;
10501064
op = MLX5_GET(mbox_in, in->first.data, opcode);

include/linux/mlx5/driver.h

+1
Original file line number | Diff line number | Diff line change
@@ -743,6 +743,7 @@ struct mlx5_cmd_work_ent {
743743
struct delayed_work cb_timeout_work;
744744
void *context;
745745
int idx;
746+
struct completion handling;
746747
struct completion done;
747748
struct mlx5_cmd *cmd;
748749
struct work_struct work;

0 commit comments

Comments
 (0)