gve: Add DQO QPL support
DQO is the descriptor format for our next-generation virtual NIC.
It is necessary to make full use of the hardware bandwidth on many
newer GCP VM shapes.

This patch extends the previously introduced DQO descriptor format
with a "QPL" mode. QPL stands for Queue Page List and refers to the
fact that the hardware cannot access arbitrary regions of host memory
and instead expects a fixed bounce buffer comprising a list of pages.

The QPL aspects are similar to the existing GQI queue format: mbufs
handed up the Rx path have external storage in the form of vm pages
attached to them, and the Tx path always copies the mbuf payload into
QPL pages.

Signed-off-by: Shailend Chand <[email protected]>
shailend-g authored and markjdb committed Nov 5, 2024
1 parent cbbd0f7 commit c44b080
Showing 12 changed files with 983 additions and 146 deletions.
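The gist of the QPL data path, as an illustrative sketch rather than code from the patch (qpl_kva, buf, frag_kva and page are placeholder variables, and the Tx line assumes the payload fits a single bounce fragment): on transmit the mbuf payload is copied into a pre-registered 2K bounce fragment, and on receive a QPL page fragment is attached to a fresh mbuf as external storage, with gve_mextadd_free() reclaiming it once the stack drops its reference.

/* Tx, QPL mode: copy the outgoing payload into one 2K bounce fragment. */
m_copydata(mbuf, 0, mbuf->m_pkthdr.len,
    (caddr_t)qpl_kva + buf * GVE_TX_BUF_SIZE_DQO);

/* Rx, QPL mode: hand a QPL page fragment to the stack as external storage. */
m_extadd(mbuf, frag_kva, GVE_DEFAULT_RX_BUFFER_SIZE, gve_mextadd_free,
    page, NULL, 0, EXT_NET_DRV);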
2 changes: 2 additions & 0 deletions share/man/man4/gve.4
@@ -239,6 +239,8 @@ The D in "DQO" refers to a newer generation of hardware, and the "QO"
stands for "Queue Out-of-order" referring to the fact that the NIC might
send Tx and Rx completions in an order different from the one in which
the corresponding descriptors were posted by the driver.
.It
DQO_QPL: The next-generation descriptor format in "QPL" mode.
.El
.Sh SUPPORT
Please email [email protected] with the specifics of the issue encountered.
101 changes: 95 additions & 6 deletions sys/dev/gve/gve.h
@@ -105,6 +105,7 @@ enum gve_queue_format {
GVE_GQI_RDA_FORMAT = 0x1,
GVE_GQI_QPL_FORMAT = 0x2,
GVE_DQO_RDA_FORMAT = 0x3,
GVE_DQO_QPL_FORMAT = 0x4,
};

enum gve_state_flags_bit {
@@ -226,18 +227,42 @@ struct gve_rxq_stats {
counter_u64_t rx_frag_flip_cnt;
counter_u64_t rx_frag_copy_cnt;
counter_u64_t rx_dropped_pkt_desc_err;
counter_u64_t rx_dropped_pkt_buf_post_fail;
counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
counter_u64_t rx_mbuf_dmamap_err;
counter_u64_t rx_mbuf_mclget_null;
};

#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))

union gve_rx_qpl_buf_id_dqo {
struct {
uint16_t buf_id:11; /* Index into rx->dqo.bufs */
uint8_t frag_num:5; /* Which frag in the QPL page */
};
uint16_t all;
} __packed;
_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2,
"gve: bad dqo qpl rx buf id length");

struct gve_rx_buf_dqo {
struct mbuf *mbuf;
bus_dmamap_t dmamap;
uint64_t addr;
bool mapped;
union {
/* RDA */
struct {
struct mbuf *mbuf;
bus_dmamap_t dmamap;
uint64_t addr;
bool mapped;
};
/* QPL */
struct {
uint8_t num_nic_frags; /* number of pending completions */
uint8_t next_idx; /* index of the next frag to post */
/* for chaining rx->dqo.used_bufs */
STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry;
};
};
/* for chaining rx->dqo.free_bufs */
SLIST_ENTRY(gve_rx_buf_dqo) slist_entry;
};
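A note on the 16-bit buffer tag above: with the little-endian bit-field layout the driver assumes, a completion's buffer id splits into a page index and a fragment index. A minimal illustration with a made-up tag value:

union gve_rx_qpl_buf_id_dqo tag;

tag.all = 0x1803;	/* example value only, not from the source */
/* tag.buf_id   == 0x1803 & 0x7ff == 3 -> rx->dqo.bufs[3]                  */
/* tag.frag_num == 0x1803 >> 11   == 3 -> fourth fragment of that QPL page */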

@@ -276,6 +301,13 @@ struct gve_rx_ring {
uint32_t tail; /* The index at which to receive the next compl at */
uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */
SLIST_HEAD(, gve_rx_buf_dqo) free_bufs;

/*
* Only used in QPL mode. Pages referred to by if_input-ed mbufs
* stay parked here till their wire count comes back to 1.
* Pages are moved here after there aren't any pending completions.
*/
STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs;
} dqo;
};
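The used_bufs list above suggests a simple recycling loop in the Rx cleanup path; the sketch below is only an approximation, and both gve_rx_recycle_used_bufs() and gve_rx_buf_page_is_idle() are hypothetical names standing in for the driver's actual routine and wire-count check:

static void
gve_rx_recycle_used_bufs(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_dqo *buf;

	while ((buf = STAILQ_FIRST(&rx->dqo.used_bufs)) != NULL) {
		/* Stop at the first page the stack still references. */
		if (!gve_rx_buf_page_is_idle(buf))
			break;
		STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, stailq_entry);
		SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
	}
}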

@@ -313,6 +345,7 @@ struct gve_txq_stats {
counter_u64_t tx_dropped_pkt_nospace_bufring;
counter_u64_t tx_delayed_pkt_nospace_descring;
counter_u64_t tx_delayed_pkt_nospace_compring;
counter_u64_t tx_delayed_pkt_nospace_qpl_bufs;
counter_u64_t tx_delayed_pkt_tsoerr;
counter_u64_t tx_dropped_pkt_vlan;
counter_u64_t tx_mbuf_collapse;
@@ -326,7 +359,19 @@

struct gve_tx_pending_pkt_dqo {
struct mbuf *mbuf;
bus_dmamap_t dmamap;
union {
/* RDA */
bus_dmamap_t dmamap;
/* QPL */
struct {
/*
* A linked list of entries from qpl_bufs that served
* as the bounce buffer for this packet.
*/
int32_t qpl_buf_head;
uint32_t num_qpl_bufs;
};
};
uint8_t state; /* the gve_packet_state enum */
int next; /* To chain the free_pending_pkts lists */
};
@@ -377,7 +422,20 @@ struct gve_tx_ring {
*/
int32_t free_pending_pkts_csm;

bus_dma_tag_t buf_dmatag; /* DMA params for mapping Tx mbufs */
/*
* The head index of a singly linked list representing QPL page fragments
* to copy mbuf payload into for the NIC to see. Once this list is depleted,
* the "_prd" suffixed producer list, grown by the completion taskqueue,
* is stolen.
*
* Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
*/
int32_t free_qpl_bufs_csm;
uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */
uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */

/* DMA params for mapping Tx mbufs. Only used in RDA mode. */
bus_dma_tag_t buf_dmatag;
} __aligned(CACHE_LINE_SIZE);

/* Accessed when processing completions */
@@ -395,13 +453,35 @@
* its consumer list, with the "_csm" suffix, is depleted.
*/
int32_t free_pending_pkts_prd;

/*
* The completion taskqueue moves the QPL pages corresponding to a
* completed packet into this list.
* The "_prd" denotes that this is a producer list. The transmit
* taskqueue steals this list once its consumer list, with the "_csm"
* suffix, is depleted.
*
* Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
*/
int32_t free_qpl_bufs_prd;
uint32_t qpl_bufs_produced;
} __aligned(CACHE_LINE_SIZE);
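Both the pending-packet list and the QPL buffer list use this consumer/producer split. A minimal sketch of the consumer side for QPL buffers, assuming -1 marks an empty list and terminates chains (gve_tx_get_qpl_buf() is a made-up name, not the driver's actual routine):

static int32_t
gve_tx_get_qpl_buf(struct gve_tx_ring *tx)
{
	int32_t buf = tx->dqo.free_qpl_bufs_csm;

	if (__predict_false(buf == -1)) {
		/* Consumer list empty: steal the whole producer list at once. */
		buf = (int32_t)atomic_swap_32(
		    (volatile uint32_t *)&tx->dqo.free_qpl_bufs_prd,
		    (uint32_t)-1);
		if (buf == -1)
			return (-1);	/* no completions have freed anything yet */
	}
	/* Unlink the head; qpl_bufs[] chains indices, -1 terminates. */
	tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf];
	tx->dqo.qpl_bufs_consumed++;
	return (buf);
}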

/* Accessed by both the completion and xmit loops */
struct {
/* completion tags index into this array */
struct gve_tx_pending_pkt_dqo *pending_pkts;
uint16_t num_pending_pkts;

/*
* Represents QPL page fragments. An index into this array
* always represents the same QPL page fragment. The value
* is also an index into this array and serves as a means
* to chain buffers into linked lists whose heads are
* either free_qpl_bufs_prd or free_qpl_bufs_csm or
* qpl_bufs_head.
*/
int32_t *qpl_bufs;
} __aligned(CACHE_LINE_SIZE);
} dqo;
};
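And a sketch of the producer side, returning a completed packet's chain through the index array (gve_tx_free_qpl_chain() is again a made-up name; the CAS loop is one safe way to splice onto the producer head while the transmit taskqueue may concurrently swap it out, though the driver's actual code may differ):

static void
gve_tx_free_qpl_chain(struct gve_tx_ring *tx,
    struct gve_tx_pending_pkt_dqo *pkt)
{
	int32_t head = pkt->qpl_buf_head;
	int32_t tail = head;
	int32_t old;

	if (head == -1)
		return;

	/* Walk to the tail of this packet's chain of bounce buffers. */
	while (tx->dqo.qpl_bufs[tail] != -1)
		tail = tx->dqo.qpl_bufs[tail];

	/* Splice the whole chain onto the producer free list. */
	do {
		old = tx->dqo.free_qpl_bufs_prd;
		tx->dqo.qpl_bufs[tail] = old;
	} while (!atomic_cmpset_32(
	    (volatile uint32_t *)&tx->dqo.free_qpl_bufs_prd,
	    (uint32_t)old, (uint32_t)head));

	tx->dqo.qpl_bufs_produced += pkt->num_qpl_bufs;
	pkt->qpl_buf_head = -1;
	pkt->num_qpl_bufs = 0;
}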
@@ -531,6 +611,13 @@ gve_is_gqi(struct gve_priv *priv)
return (priv->queue_format == GVE_GQI_QPL_FORMAT);
}

static inline bool
gve_is_qpl(struct gve_priv *priv)
{
return (priv->queue_format == GVE_GQI_QPL_FORMAT ||
priv->queue_format == GVE_DQO_QPL_FORMAT);
}

/* Defined in gve_main.c */
void gve_schedule_reset(struct gve_priv *priv);

@@ -545,6 +632,7 @@ int gve_alloc_qpls(struct gve_priv *priv);
void gve_free_qpls(struct gve_priv *priv);
int gve_register_qpls(struct gve_priv *priv);
int gve_unregister_qpls(struct gve_priv *priv);
void gve_mextadd_free(struct mbuf *mbuf);

/* TX functions defined in gve_tx.c */
int gve_alloc_tx_rings(struct gve_priv *priv);
@@ -563,6 +651,7 @@ void gve_tx_free_ring_dqo(struct gve_priv *priv, int i);
void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i);
int gve_tx_intr_dqo(void *arg);
int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr);
int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf);
void gve_tx_cleanup_tq_dqo(void *arg, int pending);

/* RX functions defined in gve_rx.c */
38 changes: 36 additions & 2 deletions sys/dev/gve/gve_adminq.c
@@ -58,6 +58,7 @@ void gve_parse_device_option(struct gve_priv *priv,
struct gve_device_option *option,
struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
struct gve_device_option_dqo_rda **dev_op_dqo_rda,
struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
{
uint32_t req_feat_mask = be32toh(option->required_features_mask);
@@ -103,6 +104,23 @@ void gve_parse_device_option(struct gve_priv *priv,
*dev_op_dqo_rda = (void *)(option + 1);
break;

case GVE_DEV_OPT_ID_DQO_QPL:
if (option_length < sizeof(**dev_op_dqo_qpl) ||
req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) {
device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
"DQO QPL", (int)sizeof(**dev_op_dqo_qpl),
GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL,
option_length, req_feat_mask);
break;
}

if (option_length > sizeof(**dev_op_dqo_qpl)) {
device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
"DQO QPL");
}
*dev_op_dqo_qpl = (void *)(option + 1);
break;

case GVE_DEV_OPT_ID_JUMBO_FRAMES:
if (option_length < sizeof(**dev_op_jumbo_frames) ||
req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) {
@@ -136,6 +154,7 @@
struct gve_device_descriptor *descriptor,
struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
struct gve_device_option_dqo_rda **dev_op_dqo_rda,
struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
{
char *desc_end = (char *)descriptor + be16toh(descriptor->total_length);
@@ -154,7 +173,10 @@
}

gve_parse_device_option(priv, descriptor, dev_opt,
dev_op_gqi_qpl, dev_op_dqo_rda, dev_op_jumbo_frames);
dev_op_gqi_qpl,
dev_op_dqo_rda,
dev_op_dqo_qpl,
dev_op_jumbo_frames);
dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length));
}

@@ -387,6 +409,7 @@ gve_adminq_describe_device(struct gve_priv *priv)
struct gve_dma_handle desc_mem;
struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL;
struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
uint32_t supported_features_mask = 0;
int rc;
@@ -416,7 +439,9 @@ gve_adminq_describe_device(struct gve_priv *priv)
bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD);

rc = gve_process_device_options(priv, desc,
&dev_op_gqi_qpl, &dev_op_dqo_rda,
&dev_op_gqi_qpl,
&dev_op_dqo_rda,
&dev_op_dqo_qpl,
&dev_op_jumbo_frames);
if (rc != 0)
goto free_device_descriptor;
@@ -430,6 +455,15 @@
if (bootverbose)
device_printf(priv->dev,
"Driver is running with DQO RDA queue format.\n");
} else if (dev_op_dqo_qpl != NULL) {
snprintf(gve_queue_format, sizeof(gve_queue_format),
"%s", "DQO QPL");
priv->queue_format = GVE_DQO_QPL_FORMAT;
supported_features_mask = be32toh(
dev_op_dqo_qpl->supported_features_mask);
if (bootverbose)
device_printf(priv->dev,
"Driver is running with DQO QPL queue format.\n");
} else if (dev_op_gqi_qpl != NULL) {
snprintf(gve_queue_format, sizeof(gve_queue_format),
"%s", "GQI QPL");
14 changes: 13 additions & 1 deletion sys/dev/gve/gve_adminq.h
@@ -144,6 +144,15 @@ struct gve_device_option_dqo_rda {
_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8,
"gve: bad admin queue struct length");

struct gve_device_option_dqo_qpl {
__be32 supported_features_mask;
__be16 tx_comp_ring_entries;
__be16 rx_buff_ring_entries;
};

_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8,
"gve: bad admin queue struct length");

struct gve_device_option_modify_ring {
__be32 supported_features_mask;
__be16 max_rx_ring_size;
@@ -168,6 +177,7 @@ enum gve_dev_opt_id {
GVE_DEV_OPT_ID_GQI_QPL = 0x3,
GVE_DEV_OPT_ID_DQO_RDA = 0x4,
GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
GVE_DEV_OPT_ID_DQO_QPL = 0x7,
GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
};

@@ -182,6 +192,7 @@ enum gve_dev_opt_req_feat_mask {
GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
};
@@ -196,7 +207,7 @@ enum gve_sup_feature_mask {
enum gve_driver_capability {
gve_driver_capability_gqi_qpl = 0,
gve_driver_capability_gqi_rda = 1,
gve_driver_capability_dqo_qpl = 2, /* reserved for future use */
gve_driver_capability_dqo_qpl = 2,
gve_driver_capability_dqo_rda = 3,
};

@@ -212,6 +223,7 @@
*/
#define GVE_DRIVER_CAPABILITY_FLAGS1 \
(GVE_CAP1(gve_driver_capability_gqi_qpl) | \
GVE_CAP1(gve_driver_capability_dqo_qpl) | \
GVE_CAP1(gve_driver_capability_dqo_rda))
#define GVE_DRIVER_CAPABILITY_FLAGS2 0x0
#define GVE_DRIVER_CAPABILITY_FLAGS3 0x0
17 changes: 16 additions & 1 deletion sys/dev/gve/gve_dqo.h
@@ -57,7 +57,22 @@
* Start dropping RX fragments if at least these many
* buffers cannot be posted to the NIC.
*/
#define GVE_RX_DQO_MIN_PENDING_BUFS 32
#define GVE_RX_DQO_MIN_PENDING_BUFS 128

#define GVE_DQ_NUM_FRAGS_IN_PAGE (PAGE_SIZE / GVE_DEFAULT_RX_BUFFER_SIZE)

/*
* gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total
* number of pages per QPL to 2048.
*/
#define GVE_RX_NUM_QPL_PAGES_DQO 2048

/* 2K TX buffers for DQO-QPL */
#define GVE_TX_BUF_SHIFT_DQO 11
#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO)
#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO)

#define GVE_TX_NUM_QPL_PAGES_DQO 512

/* Basic TX descriptor (DTYPE 0x0C) */
struct gve_tx_pkt_desc_dqo {
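A quick sizing check of the new constants, assuming 4 KiB pages and the driver's default 2 KiB Rx buffer (both are assumptions here, since PAGE_SIZE and GVE_DEFAULT_RX_BUFFER_SIZE are defined elsewhere):

/*
 * GVE_DQ_NUM_FRAGS_IN_PAGE  = 4096 / 2048 = 2 Rx fragments per QPL page
 * GVE_TX_BUF_SIZE_DQO       = 1 << 11     = 2048-byte Tx bounce buffers
 * GVE_TX_BUFS_PER_PAGE_DQO  = 4096 >> 11  = 2 Tx buffers per QPL page
 * GVE_TX_NUM_QPL_PAGES_DQO  = 512 pages   -> 1024 Tx bounce buffers per ring
 * GVE_RX_NUM_QPL_PAGES_DQO  = 2048 pages  -> 4096 Rx fragments per ring,
 *   the ceiling imposed by the 11-bit buf_id field.
 */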