gve: Fix TX livelock
Before this change, the transmit taskqueue would re-enqueue itself when it
could not find space on the NIC ring, in the hope that space would
eventually be made. This results in the following livelock, which only
occurs after passing ~200Gbps of TCP traffic for many hours:

                        100% CPU
┌───────────┐wait on  ┌──────────┐         ┌───────────┐
│user thread│  cpu    │gve xmit  │wait on  │gve cleanup│
│with mbuf  ├────────►│taskqueue ├────────►│taskqueue  │
│uma lock   │         │          │ NIC ring│           │
└───────────┘         └──────────┘  space  └─────┬─────┘
     ▲                                           │
     │      wait on mbuf uma lock                │
     └───────────────────────────────────────────┘

Further details about the livelock are available at
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=281560.

After this change, the transmit taskqueue no longer spins until there is
room on the NIC ring. Instead, it stops itself and lets the
completion-processing taskqueue wake it up once space has been freed.
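
In outline, the stop/wake handshake added by this change looks as follows
(a condensed sketch assembled from the diff below, not verbatim driver code;
locking and error handling are omitted):

    /* xmit taskqueue: NIC ring is full (ENOBUFS) */
    atomic_store_bool(&tx->stopped, true);
    atomic_thread_fence_seq_cst();
    err = gve_xmit_mbuf(tx, &mbuf);     /* retry once after publishing "stopped" */
    if (err == 0)
        atomic_store_bool(&tx->stopped, false);

    /* cleanup taskqueue: after completions have freed ring space */
    atomic_thread_fence_seq_cst();      /* explicit fence in the DQO cleanup path */
    if (atomic_load_bool(&tx->stopped) && space_freed) {
        atomic_store_bool(&tx->stopped, false);
        taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
    }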

Since I'm touching the transmit taskqueue, I've also corrected the name
of a counter and fixed a bug where EINVAL mbufs were not being freed
and instead lived forever on the bufring.
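
For the EINVAL case, the transmit path now consumes and frees the mbuf
instead of putting it back on the bufring (simplified from the
gve_xmit_br() change below):

    if (err == EINVAL) {
        drbr_advance(ifp, tx->br);      /* drop the bad mbuf from the bufring */
        m_freem(mbuf);                  /* and free it, rather than retrying it forever */
    } else
        drbr_putback(ifp, tx->br, mbuf);        /* transient failure: keep it for a retry */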

Signed-off-by: Shailend Chand <[email protected]>
shailend-g authored and markjdb committed Nov 5, 2024
1 parent c44b080 commit b8e723f
Showing 5 changed files with 95 additions and 26 deletions.
3 changes: 2 additions & 1 deletion sys/dev/gve/gve.h
@@ -341,7 +341,7 @@ struct gve_txq_stats {
     counter_u64_t tpackets;
     counter_u64_t tso_packet_cnt;
     counter_u64_t tx_dropped_pkt;
-    counter_u64_t tx_dropped_pkt_nospace_device;
+    counter_u64_t tx_delayed_pkt_nospace_device;
     counter_u64_t tx_dropped_pkt_nospace_bufring;
     counter_u64_t tx_delayed_pkt_nospace_descring;
     counter_u64_t tx_delayed_pkt_nospace_compring;
@@ -383,6 +383,7 @@ struct gve_tx_ring {

     struct task xmit_task;
     struct taskqueue *xmit_tq;
+    bool stopped;

     /* Accessed when writing descriptors */
     struct buf_ring *br;
4 changes: 2 additions & 2 deletions sys/dev/gve/gve_main.c
@@ -32,10 +32,10 @@
 #include "gve_adminq.h"
 #include "gve_dqo.h"

-#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.0\n"
+#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.1\n"
 #define GVE_VERSION_MAJOR 1
 #define GVE_VERSION_MINOR 3
-#define GVE_VERSION_SUB 0
+#define GVE_VERSION_SUB 1

 #define GVE_DEFAULT_RX_COPYBREAK 256

6 changes: 3 additions & 3 deletions sys/dev/gve/gve_sysctl.c
@@ -140,9 +140,9 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
         "tx_bytes", CTLFLAG_RD,
         &stats->tbytes, "Bytes transmitted");
     SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
-        "tx_dropped_pkt_nospace_device", CTLFLAG_RD,
-        &stats->tx_dropped_pkt_nospace_device,
-        "Packets dropped due to no space in device");
+        "tx_delayed_pkt_nospace_device", CTLFLAG_RD,
+        &stats->tx_delayed_pkt_nospace_device,
+        "Packets delayed due to no space in device");
     SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
         "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD,
         &stats->tx_dropped_pkt_nospace_bufring,
98 changes: 78 additions & 20 deletions sys/dev/gve/gve_tx.c
@@ -246,6 +246,8 @@ gve_start_tx_ring(struct gve_priv *priv, int i,
     struct gve_tx_ring *tx = &priv->tx[i];
     struct gve_ring_com *com = &tx->com;

+    atomic_store_bool(&tx->stopped, false);
+
     NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx);
     com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
         taskqueue_thread_enqueue, &com->cleanup_tq);
@@ -427,6 +429,11 @@ gve_tx_cleanup_tq(void *arg, int pending)
         gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
         taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
     }
+
+    if (atomic_load_bool(&tx->stopped) && space_freed) {
+        atomic_store_bool(&tx->stopped, false);
+        taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+    }
 }

 static void
@@ -671,8 +678,7 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
     bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
     if (__predict_false(!gve_can_tx(tx, bytes_required))) {
         counter_enter();
-        counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1);
-        counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
+        counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
         counter_exit();
         return (ENOBUFS);
     }
@@ -733,6 +739,59 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
     return (0);
 }

+static int
+gve_xmit_mbuf(struct gve_tx_ring *tx,
+    struct mbuf **mbuf)
+{
+    if (gve_is_gqi(tx->com.priv))
+        return (gve_xmit(tx, *mbuf));
+
+    if (gve_is_qpl(tx->com.priv))
+        return (gve_xmit_dqo_qpl(tx, *mbuf));
+
+    /*
+     * gve_xmit_dqo might attempt to defrag the mbuf chain.
+     * The reference is passed in so that in the case of
+     * errors, the new mbuf chain is what's put back on the br.
+     */
+    return (gve_xmit_dqo(tx, mbuf));
+}
+
+/*
+ * Has the side-effect of stopping the xmit queue by setting tx->stopped
+ */
+static int
+gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
+    struct mbuf **mbuf)
+{
+    int err;
+
+    atomic_store_bool(&tx->stopped, true);
+
+    /*
+     * Room made in the queue BEFORE the barrier will be seen by the
+     * gve_xmit_mbuf retry below.
+     *
+     * If room is made in the queue AFTER the barrier, the cleanup tq
+     * iteration creating the room will either see a tx->stopped value
+     * of 0 or the 1 we just wrote:
+     *
+     * If it sees a 1, then it would enqueue the xmit tq. Enqueue
+     * implies a retry on the waiting pkt.
+     *
+     * If it sees a 0, then that implies a previous iteration overwrote
+     * our 1, and that iteration would enqueue the xmit tq. Enqueue
+     * implies a retry on the waiting pkt.
+     */
+    atomic_thread_fence_seq_cst();
+
+    err = gve_xmit_mbuf(tx, mbuf);
+    if (err == 0)
+        atomic_store_bool(&tx->stopped, false);
+
+    return (err);
+}
+
 static void
 gve_xmit_br(struct gve_tx_ring *tx)
 {
@@ -743,24 +802,23 @@ gve_xmit_br(struct gve_tx_ring *tx)

     while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
         (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
+        err = gve_xmit_mbuf(tx, &mbuf);

-        if (gve_is_gqi(priv))
-            err = gve_xmit(tx, mbuf);
-        else {
-            /*
-             * gve_xmit_dqo might attempt to defrag the mbuf chain.
-             * The reference is passed in so that in the case of
-             * errors, the new mbuf chain is what's put back on the br.
-             */
-            if (gve_is_qpl(priv))
-                err = gve_xmit_dqo_qpl(tx, mbuf);
-            else
-                err = gve_xmit_dqo(tx, &mbuf);
-        }
+        /*
+         * We need to stop this taskqueue when we can't xmit the pkt due
+         * to lack of space in the NIC ring (ENOBUFS). The retry exists
+         * to guard against a TOCTTOU bug that could end up freezing the
+         * queue forever.
+         */
+        if (__predict_false(mbuf != NULL && err == ENOBUFS))
+            err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

         if (__predict_false(err != 0 && mbuf != NULL)) {
-            drbr_putback(ifp, tx->br, mbuf);
-            taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+            if (err == EINVAL) {
+                drbr_advance(ifp, tx->br);
+                m_freem(mbuf);
+            } else
+                drbr_putback(ifp, tx->br, mbuf);
             break;
         }

@@ -827,7 +885,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
     is_br_empty = drbr_empty(ifp, tx->br);
     err = drbr_enqueue(ifp, tx->br, mbuf);
     if (__predict_false(err != 0)) {
-        taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+        if (!atomic_load_bool(&tx->stopped))
+            taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
         counter_enter();
         counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
         counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
@@ -842,9 +901,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
     if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
         gve_xmit_br(tx);
         GVE_RING_UNLOCK(tx);
-    } else {
+    } else if (!atomic_load_bool(&tx->stopped))
         taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
-    }

     return (0);
 }
10 changes: 10 additions & 0 deletions sys/dev/gve/gve_tx_dqo.c
@@ -1046,6 +1046,16 @@ gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget)
         work_done++;
     }

+    /*
+     * Waking the xmit taskqueue has to occur after room has been made in
+     * the queue.
+     */
+    atomic_thread_fence_seq_cst();
+    if (atomic_load_bool(&tx->stopped) && work_done) {
+        atomic_store_bool(&tx->stopped, false);
+        taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+    }
+
     tx->done += work_done; /* tx->done is just a sysctl counter */
     counter_enter();
     counter_u64_add_protected(tx->stats.tbytes, bytes_done);
