Merge pull request torvalds#184 from hkchu/offload
lkl: Add offload (TSO4, CSUM) support to LKL device, #2 of 2
Octavian Purdila authored Aug 3, 2016
2 parents 59c55ff + dd9bbf6 commit bafaea1
Showing 4 changed files with 173 additions and 48 deletions.
96 changes: 79 additions & 17 deletions tools/lkl/lib/virtio.c
@@ -1,5 +1,6 @@
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <lkl_host.h>
#include <lkl/linux/virtio_ring.h>
#include "iomem.h"
@@ -70,10 +71,32 @@ void virtio_req_complete(struct virtio_req *req, uint32_t len)
struct virtio_queue *q = req->q;
struct virtio_dev *dev = req->dev;
uint16_t idx = le16toh(q->used->idx) & (q->num - 1);
uint16_t new = le16toh(q->used->idx) + 1;
uint16_t new;
int send_irq = 0;
int avail_used;

q->used->ring[idx].id = htole16(req->idx);
if (req->mergeable_rx_len == 0) {
new = le16toh(q->used->idx) + 1;
avail_used = 1;
} else {
/* we've potentially used up multiple (non-chained)
* descriptors and have to create one "used" entry for
* each descr we've consumed.
*/
int i = 0, last_idx = q->last_avail_idx, req_idx;

avail_used = req->buf_count;
new = le16toh(q->used->idx) + req->buf_count;
while (i < req->buf_count-1) {
q->used->ring[idx].len = htole16(req->buf[i].len);
len -= req->buf[i].len;
idx++; i++; last_idx++;
idx &= (q->num - 1);
req_idx = q->avail->ring[last_idx & (q->num - 1)];
q->used->ring[idx].id = htole16(req_idx);
}
}
q->used->ring[idx].len = htole16(len);
/* Make sure all memory writes before are visible to the driver before
* updating the idx.
@@ -86,9 +109,9 @@ void virtio_req_complete(struct virtio_req *req, uint32_t len)

/* Triggers the irq whenever there is no available buffer.
* q->last_avail_idx is incremented after calling virtio_req_complete(),
* so here we need to add one to it.
* so here we need to add avail_used to it.
*/
if (q->last_avail_idx + 1 == q->avail->idx)
if (q->last_avail_idx + avail_used == q->avail->idx)
send_irq = 1;

/* There are two rings: q->avail and q->used for each of the rx and tx
@@ -148,35 +171,68 @@ static void init_dev_buf_from_vring_desc(struct lkl_dev_buf *buf,
bad_driver("bad vring_desc\n");
}

/*
* Below there are two distinctly different (per packet) buffer allocation
* schemes for us to deal with:
*
* 1. One or more descriptors chained through "next" as indicated by the
* LKL_VRING_DESC_F_NEXT flag,
* 2. One or more descriptors from the ring sequentially, as many as are
* available and needed. This is the RX only "mergeable_rx_bufs" mode.
* The mode is entered when the VIRTIO_NET_F_MRG_RXBUF device feature
* is enabled.
*/
static int virtio_process_one(struct virtio_dev *dev, struct virtio_queue *q,
int idx)
int idx, bool is_mergeable_rx)
{
int q_buf_cnt = 0, ret = -1;
struct virtio_req req = {
.dev = dev,
.q = q,
.idx = q->avail->ring[idx & (q->num - 1)],
.mergeable_rx_len = 0,
};
uint16_t prev_flags = LKL_VRING_DESC_F_NEXT;
struct lkl_vring_desc *curr_vring_desc = vring_desc_at_le_idx(q, req.idx);

while ((prev_flags & LKL_VRING_DESC_F_NEXT) &&
(q_buf_cnt < VIRTIO_REQ_MAX_BUFS)) {
prev_flags = le16toh(curr_vring_desc->flags);
init_dev_buf_from_vring_desc(&req.buf[q_buf_cnt++], curr_vring_desc);
curr_vring_desc = vring_desc_at_le_idx(q, curr_vring_desc->next);
}

/* Somehow, we've built a request that's too long to fit onto our device */
if (q_buf_cnt == VIRTIO_REQ_MAX_BUFS &&
(prev_flags & LKL_VRING_DESC_F_NEXT))
bad_driver("enqueued too many request bufs");
if (is_mergeable_rx) {
int len = 0, desc_idx;

/* We may receive up to a 64KB TSO packet so collect as many
* descriptors as are available, up to 64KB in total length.
*/
while ((len < 65535) && (q_buf_cnt < VIRTIO_REQ_MAX_BUFS)) {
init_dev_buf_from_vring_desc(
&req.buf[q_buf_cnt], curr_vring_desc);
len += req.buf[q_buf_cnt++].len;
if (++idx == le16toh(q->avail->idx))
break;
desc_idx = q->avail->ring[idx & (q->num - 1)];
curr_vring_desc = vring_desc_at_le_idx(q, desc_idx);
}
req.mergeable_rx_len = len;
} else {
while ((prev_flags & LKL_VRING_DESC_F_NEXT) &&
(q_buf_cnt < VIRTIO_REQ_MAX_BUFS)) {
prev_flags = le16toh(curr_vring_desc->flags);
init_dev_buf_from_vring_desc(
&req.buf[q_buf_cnt++], curr_vring_desc);
curr_vring_desc =
vring_desc_at_le_idx(q, curr_vring_desc->next);
}
/* Somehow we've built a request too long to fit our device */
if (q_buf_cnt == VIRTIO_REQ_MAX_BUFS &&
(prev_flags & LKL_VRING_DESC_F_NEXT))
bad_driver("enqueued too many request bufs");
}
req.buf_count = q_buf_cnt;
ret = dev->ops->enqueue(dev, &req);
if (ret < 0)
return ret;
q->last_avail_idx++;
if (is_mergeable_rx)
q->last_avail_idx += ret;
else
q->last_avail_idx++;
return 0;
}
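
A minimal standalone sketch of the collection rule in the mergeable RX branch of virtio_process_one() above: keep taking whole descriptors from the avail ring until roughly 64KB of space or VIRTIO_REQ_MAX_BUFS slots have been gathered. The helper and buffer sizes below are made up for illustration and use plain local types, not the real struct virtio_queue:

#include <stdio.h>
#include <stdint.h>

#define MAX_BUFS 19	/* stands in for VIRTIO_REQ_MAX_BUFS */

/* hypothetical helper: how many avail-ring buffers one merged receive
 * would consume, and how much space they add up to */
static int collect_mergeable(const uint32_t *buf_len, int avail,
			     uint32_t *total)
{
	int used = 0;
	uint32_t len = 0;

	while (len < 65535 && used < MAX_BUFS && used < avail)
		len += buf_len[used++];

	*total = len;
	return used;
}

int main(void)
{
	uint32_t lens[32], total;
	int i, n;

	/* page-sized RX buffers, as a guest driver would typically post */
	for (i = 0; i < 32; i++)
		lens[i] = 4096;

	n = collect_mergeable(lens, 32, &total);
	/* with 4KB buffers this gathers 16 descriptors = 64KB of space,
	 * enough for a maximally sized TSO packet */
	printf("consumed %d descriptors, %u bytes of space\n",
	       n, (unsigned)total);
	return 0;
}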

@@ -200,18 +256,24 @@ static int virtio_process_one(struct virtio_dev *dev, struct virtio_queue *q,
void virtio_process_queue(struct virtio_dev *dev, uint32_t qidx)
{
struct virtio_queue *q = &dev->queue[qidx];
bool is_mergeable_rx;

if (!q->ready)
return;

if (dev->ops->acquire_queue)
dev->ops->acquire_queue(dev, qidx);

is_mergeable_rx = ((dev->device_id == LKL_VIRTIO_ID_NET) &&
is_rx_queue(dev, q) &&
(dev->device_features & BIT(LKL_VIRTIO_NET_F_MRG_RXBUF)));

while (q->last_avail_idx != le16toh(q->avail->idx)) {
/* Make sure the following loads happen after loading q->avail->idx.
*/
__sync_synchronize();
if (virtio_process_one(dev, q, q->last_avail_idx) < 0)
if (virtio_process_one(dev, q, q->last_avail_idx,
is_mergeable_rx) < 0)
break;
if (q->last_avail_idx == le16toh(q->avail->idx))
virtio_set_avail_event(q, q->avail->idx);
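
The used-ring accounting in virtio_req_complete() above is easiest to follow with concrete numbers. The sketch below uses plain arrays, host byte order and made-up sizes rather than the real vring structures: every consumed descriptor gets its own used entry, each but the last reporting the buffer's full size, and the last one carrying whatever length remains.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t buf_len[4] = { 4096, 4096, 4096, 4096 };	/* descriptor sizes */
	uint32_t used_len[4];
	uint32_t len = 9000;	/* total bytes of one merged packet */
	int buf_count = 3;	/* descriptors actually consumed */
	int i;

	/* mirror of the mergeable branch: all but the last used entry
	 * report the full buffer size... */
	for (i = 0; i < buf_count - 1; i++) {
		used_len[i] = buf_len[i];
		len -= buf_len[i];
	}
	/* ...and the last entry reports the remainder (9000 - 8192 = 808) */
	used_len[buf_count - 1] = len;

	for (i = 0; i < buf_count; i++)
		printf("used[%d].len = %u\n", i, (unsigned)used_len[i]);
	return 0;
}
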
32 changes: 31 additions & 1 deletion tools/lkl/lib/virtio.h
@@ -4,14 +4,28 @@
#include <stdint.h>
#include <lkl_host.h>

#define VIRTIO_REQ_MAX_BUFS 4
#define PAGE_SIZE 4096

/* The following are copied from skbuff.h */
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif

#define VIRTIO_REQ_MAX_BUFS (MAX_SKB_FRAGS + 2)

/* We always have 2 queues on a netdev: one for tx, one for rx. */
#define RX_QUEUE_IDX 0
#define TX_QUEUE_IDX 1

struct virtio_req {
struct virtio_dev *dev;
struct virtio_queue *q;
uint16_t idx;
uint16_t buf_count;
struct lkl_dev_buf buf[VIRTIO_REQ_MAX_BUFS];
uint32_t mergeable_rx_len;
};

struct virtio_dev_ops {
@@ -21,6 +35,9 @@ struct virtio_dev_ops {
* the current request is not consumed from the queue and the host
* device is responsible for restarting the queue processing by calling
* virtio_process_queue at a later time.
* A special case exists if a netdev is in mergeable RX buffer mode
* where more than one "avail" slot may be consumed. In this case
* it returns the number of avail entries to advance.
*/
int (*enqueue)(struct virtio_dev *dev, struct virtio_req *req);
/* Acquire/release a lock on the specified queue. Only
@@ -71,4 +88,17 @@ void virtio_process_queue(struct virtio_dev *dev, uint32_t qidx);
#define container_of(ptr, type, member) \
(type *)((char *)(ptr) - __builtin_offsetof(type, member))


static inline int is_rx_queue(struct virtio_dev *dev,
struct virtio_queue *queue)
{
return &dev->queue[RX_QUEUE_IDX] == queue;
}

static inline int is_tx_queue(struct virtio_dev *dev,
struct virtio_queue *queue)
{
return &dev->queue[TX_QUEUE_IDX] == queue;
}

#endif /* _LKL_LIB_VIRTIO_H */
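
With the 4096-byte PAGE_SIZE defined above, 65536/PAGE_SIZE + 1 evaluates to 17, so the #else branch applies and VIRTIO_REQ_MAX_BUFS becomes 17 + 2 = 19: enough slots for a 64KB payload split into page-sized fragments plus two extra entries (presumably the vnet header buffer and a spare). A quick compile-time check of that arithmetic, written as a standalone C11 sketch rather than part of the header:

#include <assert.h>

#define PAGE_SIZE 4096
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif
#define VIRTIO_REQ_MAX_BUFS (MAX_SKB_FRAGS + 2)

/* 65536/4096 + 1 = 17 fragments, plus 2 extra request slots = 19 */
static_assert(VIRTIO_REQ_MAX_BUFS == 19, "expected 19 request buffers");

int main(void) { return 0; }
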
86 changes: 57 additions & 29 deletions tools/lkl/lib/virtio_net.c
@@ -8,11 +8,8 @@
#define netdev_of(x) (container_of(x, struct virtio_net_dev, dev))
#define BIT(x) (1ULL << x)

/* We always have 2 queues on a netdev: one for tx, one for rx. */
#define RX_QUEUE_IDX 0
#define TX_QUEUE_IDX 1
#define NUM_QUEUES (TX_QUEUE_IDX + 1)
#define QUEUE_DEPTH 32
#define QUEUE_DEPTH 128

/* In fact, we'll hit the limit on the devs string below long before
* we hit this, but it's good enough for now. */
@@ -59,51 +56,82 @@ static void net_release_queue(struct virtio_dev *dev, int queue_idx)
lkl_host_ops.mutex_unlock(netdev_of(dev)->queue_locks[queue_idx]);
}

static inline int is_rx_queue(struct virtio_dev *dev, struct virtio_queue *queue)
{
return &dev->queue[RX_QUEUE_IDX] == queue;
}

static inline int is_tx_queue(struct virtio_dev *dev, struct virtio_queue *queue)
{
return &dev->queue[TX_QUEUE_IDX] == queue;
}

/* The buffers passed through "req" from the virtio_net driver always
* start with a vnet_hdr. We need to check whether the backend device
* expects a vnet_hdr and adjust the buffer offset accordingly.
*/
static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req)
{
struct lkl_virtio_net_hdr_v1 *header;
struct virtio_net_dev *net_dev;
int ret;
struct lkl_dev_buf iov[1];
int ret, len, i;
struct lkl_dev_buf *iov;

header = req->buf[0].addr;
net_dev = netdev_of(dev);
iov[0].len = req->buf[0].len - sizeof(*header);

iov[0].addr = &header[1];

if (!iov[0].len && req->buf_count > 1) {
iov[0].addr = req->buf[1].addr;
iov[0].len = req->buf[1].len;
if (!net_dev->nd->has_vnet_hdr) {
/* The backend device does not expect a vnet_hdr, so adjust
* the buffer accordingly. (We adjust req->buf in place so it
* can be used directly for the tx/rx call, but remember to
* undo the change after the call.)
* Note that it's ok to pass an iov entry with len == 0; the
* caller will simply skip to the next entry.
*/
req->buf[0].addr += sizeof(*header);
req->buf[0].len -= sizeof(*header);
}
iov = req->buf;

/* Pick which virtqueue to send the buffer(s) to */
if (is_tx_queue(dev, req->q)) {
ret = net_dev->ops->tx(net_dev->nd, iov, 1);
ret = net_dev->ops->tx(net_dev->nd, iov, req->buf_count);
if (ret < 0)
return -1;
i = 1;
} else if (is_rx_queue(dev, req->q)) {
header->num_buffers = 1;
ret = net_dev->ops->rx(net_dev->nd, iov, 1);
ret = net_dev->ops->rx(net_dev->nd, iov, req->buf_count);
if (ret < 0)
return -1;
if (net_dev->nd->has_vnet_hdr) {

/* if the number of bytes returned exactly matches
* the total space in the iov then there is a good
* chance we did not supply a large enough buffer for
* the whole pkt, i.e., pkt has been truncated.
* This is only likely to happen under mergeable RX
* buffer mode.
*/
if (req->mergeable_rx_len == (unsigned int)ret)
lkl_printf("PKT is likely truncated! len=%d\n",
ret);
} else {
header->flags = 0;
header->gso_type = LKL_VIRTIO_NET_HDR_GSO_NONE;
}
/* Have to compute how many descriptors we've consumed (really
* only matters in the mergeable RX mode) and return it
* through "num_buffers".
*/
for (i = 0, len = ret; len > 0; i++)
len -= req->buf[i].len;
req->buf_count = header->num_buffers = i;
/* Need to set "buf_count" to how many we really used in
* order for virtio_req_complete() to work.
*/
if (dev->device_features & BIT(LKL_VIRTIO_NET_F_GUEST_CSUM))
header->flags = LKL_VIRTIO_NET_HDR_F_DATA_VALID;
} else {
bad_request("tried to push on non-existent queue");
return -1;
}

virtio_req_complete(req, iov[0].len + sizeof(*header));
return 0;
if (!net_dev->nd->has_vnet_hdr) {
/* Undo the adjustment */
req->buf[0].addr -= sizeof(*header);
req->buf[0].len += sizeof(*header);
ret += sizeof(struct lkl_virtio_net_hdr_v1);
}
virtio_req_complete(req, ret);
return i;
}

static struct virtio_dev_ops net_ops = {
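
The descriptor-count loop in net_enqueue() above walks the request buffers until the byte count returned by the backend rx() call is exhausted; the result is what ends up in header->num_buffers and back in req->buf_count. A standalone sketch of that computation with made-up sizes and plain int arithmetic, not the real req->buf types:

#include <stdio.h>

int main(void)
{
	int buf_len[19] = { 4096, 4096, 4096, 4096, 4096 };
	int ret = 9000;		/* bytes the backend rx() call returned */
	int i, len;

	/* same loop shape as net_enqueue(): consume buffers until the
	 * returned length is used up */
	for (i = 0, len = ret; len > 0; i++)
		len -= buf_len[i];

	/* 9000 bytes spill into a third 4KB buffer, so num_buffers = 3 */
	printf("num_buffers = %d\n", i);
	return 0;
}
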
7 changes: 6 additions & 1 deletion tools/lkl/lib/virtio_net_linux_fdnet.c
@@ -32,6 +32,12 @@ struct lkl_netdev_linux_fdnet_ops lkl_netdev_linux_fdnet_ops = {
#endif /* __NR_eventfd */
};

/* The following tx() and rx() code assumes struct lkl_dev_buf matches
* struct iovec so we can safely cast iov to (struct iovec *). (If
* BUILD_BUG_ON() were supported in LKL, I would have added
*
* "BUILD_BUG_ON(sizeof(struct lkl_dev_buf) != sizeof(struct iovec));")
*/
static int linux_fdnet_net_tx(struct lkl_netdev *nd,
struct lkl_dev_buf *iov, int cnt)
{
@@ -61,7 +67,6 @@ static int linux_fdnet_net_rx(struct lkl_netdev *nd,

if (ret < 0 && errno != EAGAIN)
perror("read from fdnet device fails");

return ret;
}
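
The cast-safety assumption documented at the top of this file's diff can also be expressed as a compile-time check. Below is a sketch using C11 static_assert and a local stand-in for struct lkl_dev_buf (the real definition lives in the LKL headers, so the field names here are assumptions); note that BUILD_BUG_ON() takes the failing condition (sizes differ) while static_assert() takes the passing one (sizes match):

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

struct dev_buf_model {		/* stand-in for struct lkl_dev_buf */
	void *addr;
	size_t len;
};

/* the readv()/writev() casts are only safe if the two layouts match */
static_assert(sizeof(struct dev_buf_model) == sizeof(struct iovec),
	      "buffer descriptor must match struct iovec");
static_assert(offsetof(struct dev_buf_model, addr) ==
	      offsetof(struct iovec, iov_base),
	      "address field must line up with iov_base");

int main(void) { return 0; }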

