Skip to content

Commit

Permalink
Merge branch 'udp-gso'
Browse files Browse the repository at this point in the history
Willem de Bruijn says:

====================
udp gso

Segmentation offload reduces cycles/byte for large packets by
amortizing the cost of protocol stack traversal.

This patchset implements GSO for UDP. A process can concatenate and
submit multiple datagrams to the same destination in one send call
by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
or passing an analogous cmsg at send time.

The stack will send the entire large (up to network layer max size)
datagram through the protocol layer. At the GSO layer, it is broken
up in individual segments. All receive the same network layer header
and UDP src and dst port. All but the last segment have the same UDP
header, but the last may differ in length and checksum.

Initial results show a significant reduction in UDP cycles/byte.
See the main patch for more details and benchmark results.

        udp
          876 MB/s 14873 msg/s 624666 calls/s
            11,205,777,429      cycles

        udp gso
         2139 MB/s 36282 msg/s 36282 calls/s
            11,204,374,561      cycles

The patch set is broken down as follows:
- patch 1 is a prerequisite: code rearrangement, noop otherwise
- patch 2 implements the gso logic
- patch 3 adds protocol stack support for UDP_SEGMENT
- patch 4,5,7 are refinements
- patch 6 adds the cmsg interface
- patch 8..11 are tests

This idea was presented previously at netconf 2017-2
http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Changes v1 -> v2
  - Convert __udp_gso_segment to modify headers after skb_segment
  - Split main patch into two, one for gso logic, one for UDP_SEGMENT

Changes RFC -> v1
  - MSG_MORE:
      fixed, by allowing checksum offload with corking if gso
  - SKB_GSO_UDP_L4:
      made independent from SKB_GSO_UDP
      and removed skb_is_ufo() wrapper
  - NETIF_F_GSO_UDP_L4:
      add to netdev_features_string
      and to netdev-features.txt
      add BUILD_BUG_ON to match SKB_GSO_UDP_L4 value
  - UDP_MAX_SEGMENTS:
      introduce limit on number of segments per gso skb
      to avoid extreme cases like IP_MAX_MTU/IPV4_MIN_MTU
  - CHECKSUM_PARTIAL:
      test against missing feature after ndo_features_check
      if not supported return error, analogous to udp_send_check
  - MSG_ZEROCOPY: removed, deferred for now
====================

Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
davem330 committed Apr 26, 2018
2 parents a9537c9 + 3a687be commit cb586c6
Show file tree
Hide file tree
Showing 26 changed files with 1,687 additions and 51 deletions.
7 changes: 7 additions & 0 deletions Documentation/networking/netdev-features.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ whatever headers there might be.
NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit
set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6).

* Transmit UDP segmentation offload

NETIF_F_GSO_UDP_GSO_L4 accepts a single UDP header with a payload that exceeds
gso_size. On segmentation, it segments the payload on gso_size boundaries and
replicates the network and UDP headers (fixing up the last one if less than
gso_size).

* Transmit DMA from high memory

On platforms where this is relevant, NETIF_F_HIGHDMA signals that
Expand Down
5 changes: 4 additions & 1 deletion include/linux/netdev_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ enum {
NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */
NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */
NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */
NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */
/**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */
NETIF_F_GSO_UDP_BIT,
NETIF_F_GSO_UDP_L4_BIT,

NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */
NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */
Expand Down Expand Up @@ -147,6 +148,7 @@ enum {
#define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM)
#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT)
#define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD)
#define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4)

#define for_each_netdev_feature(mask_addr, bit) \
for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
Expand Down Expand Up @@ -216,6 +218,7 @@ enum {
NETIF_F_GSO_GRE_CSUM | \
NETIF_F_GSO_IPXIP4 | \
NETIF_F_GSO_IPXIP6 | \
NETIF_F_GSO_UDP_L4 | \
NETIF_F_GSO_UDP_TUNNEL | \
NETIF_F_GSO_UDP_TUNNEL_CSUM)

Expand Down
1 change: 1 addition & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -4186,6 +4186,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));

return (features & feature) == feature;
}
Expand Down
2 changes: 2 additions & 0 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,8 @@ enum {
SKB_GSO_ESP = 1 << 15,

SKB_GSO_UDP = 1 << 16,

SKB_GSO_UDP_L4 = 1 << 17,
};

#if BITS_PER_LONG > 32
Expand Down
3 changes: 3 additions & 0 deletions include/linux/udp.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ struct udp_sock {
* when the socket is uncorked.
*/
__u16 len; /* total length of pending frames */
__u16 gso_size;
/*
* Fields specific to UDP-Lite.
*/
Expand Down Expand Up @@ -87,6 +88,8 @@ struct udp_sock {
int forward_deficit;
};

#define UDP_MAX_SEGMENTS (1 << 6UL)

static inline struct udp_sock *udp_sk(const struct sock *sk)
{
return (struct udp_sock *)sk;
Expand Down
1 change: 1 addition & 0 deletions include/net/inet_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ struct inet_cork {
__u8 ttl;
__s16 tos;
char priority;
__u16 gso_size;
};

struct inet_cork_full {
Expand Down
3 changes: 2 additions & 1 deletion include/net/ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ struct ipcm_cookie {
__u8 ttl;
__s16 tos;
char priority;
__u16 gso_size;
};

#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
Expand Down Expand Up @@ -171,7 +172,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags);
struct inet_cork *cork, unsigned int flags);

static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
{
Expand Down
2 changes: 2 additions & 0 deletions include/net/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ struct ipcm6_cookie {
__s16 tclass;
__s8 dontfrag;
struct ipv6_txoptions *opt;
__u16 gso_size;
};

static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
Expand Down Expand Up @@ -950,6 +951,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
struct inet_cork_full *cork,
const struct sockcm_cookie *sockc);

static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
Expand Down
5 changes: 5 additions & 0 deletions include/net/udp.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);

struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features,
unsigned int mss, __sum16 check);

static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
struct udphdr *uh;
Expand Down Expand Up @@ -269,6 +273,7 @@ int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/udp.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct udphdr {
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
#define UDP_SEGMENT 103 /* Set GSO segmentation size */

/* UDP encapsulation types */
#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
Expand Down
1 change: 1 addition & 0 deletions net/core/ethtool.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
[NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",

[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
Expand Down
2 changes: 2 additions & 0 deletions net/core/skbuff.c
Original file line number Diff line number Diff line change
Expand Up @@ -4940,6 +4940,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
Expand Down
41 changes: 25 additions & 16 deletions net/ipv4/ip_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
bool paged;

skb = skb_peek_tail(queue);

exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
paged = !!cork->gso_size;

if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
Expand All @@ -906,7 +909,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
!(flags & MSG_MORE) &&
(!(flags & MSG_MORE) || cork->gso_size) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;

Expand All @@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
Expand All @@ -953,8 +957,12 @@ static int __ip_append_data(struct sock *sk,
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
else if (!paged)
alloclen = fraglen;
else {
alloclen = min_t(int, fraglen, MAX_HEADER);
pagedlen = fraglen - alloclen;
}

alloclen += exthdrlen;

Expand Down Expand Up @@ -998,7 +1006,7 @@ static int __ip_append_data(struct sock *sk,
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen + exthdrlen);
data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
Expand All @@ -1014,15 +1022,15 @@ static int __ip_append_data(struct sock *sk,
pskb_trim_unique(skb_prev, maxfraglen);
}

copy = datalen - transhdrlen - fraggap;
copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}

offset += copy;
length -= datalen - fraggap;
length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
Expand Down Expand Up @@ -1135,6 +1143,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;

cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
Expand Down Expand Up @@ -1214,7 +1224,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EOPNOTSUPP;

hh_len = LL_RESERVED_SPACE(rt->dst.dev);
mtu = cork->fragsize;
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;

fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
Expand Down Expand Up @@ -1470,9 +1480,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
struct inet_cork *cork, unsigned int flags)
{
struct inet_cork cork;
struct sk_buff_head queue;
int err;

Expand All @@ -1481,22 +1490,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,

__skb_queue_head_init(&queue);

cork.flags = 0;
cork.addr = 0;
cork.opt = NULL;
err = ip_setup_cork(sk, &cork, ipc, rtp);
cork->flags = 0;
cork->addr = 0;
cork->opt = NULL;
err = ip_setup_cork(sk, cork, ipc, rtp);
if (err)
return ERR_PTR(err);

err = __ip_append_data(sk, fl4, &queue, &cork,
err = __ip_append_data(sk, fl4, &queue, cork,
&current->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
__ip_flush_pending_frames(sk, &queue, &cork);
__ip_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}

return __ip_make_skb(sk, fl4, &queue, &cork);
return __ip_make_skb(sk, fl4, &queue, cork);
}

/*
Expand Down
Loading

0 comments on commit cb586c6

Please sign in to comment.