From e16801f90f95257aa55587a0c7a5d3ad4a9d7e60 Mon Sep 17 00:00:00 2001 From: Andrew Stoycos Date: Wed, 2 Nov 2022 10:24:06 -0400 Subject: [PATCH 1/2] Try and Fix XDP redirect in KIND Try and ensure XDP redirect will work in KIND This Updates the XDP program to rewrite: - Src Mac address - Dst Port address and allows us to configure if we want to run cksum calculation on a per backend basis. It also fixes the existing CKSUM calculation function to ignore trailing bits. Lastly We should be using the autogenerated bpf2go map structs from `bpf_*.go` Signed-off-by: Andrew Stoycos --- dataplane/README.md | 191 ++++++++++++++++++++++++++++++++++++++++++++ dataplane/main.go | 31 ++++--- dataplane/xdp.c | 87 +++++++++++++++----- 3 files changed, 275 insertions(+), 34 deletions(-) create mode 100644 dataplane/README.md diff --git a/dataplane/README.md b/dataplane/README.md new file mode 100644 index 00000000..6ad3fd21 --- /dev/null +++ b/dataplane/README.md @@ -0,0 +1,191 @@ +# Some helpful hints for debugging this XDP program + +## Tracing XDP redirect (on first interface where main XDP program is attached) + +(TODO finish tracing the XDP path through the kernel) +1. Entry at `xdp_do_redirect` + - Frags Don't work `xdp_buff_has_frags` + - If map == XSKMAP -> `__xdp_do_redirect_xsk` + - Returns `__xdp_do_redirect_frame` + +2. Entry `__xdp_do_redirect_frame` (Can't trace internal functions?) 
+ + +## Tracing once the packet meets the host end of the veth + +(TODO finish tracing the XDP path through the kernel) +__netif_receive_skb_core + + +## Debugging UDP Checksum issues + +We can use tcpdump to see if cksums are correct once the packets reach the container: + +```bash +sudo tcpdump -vvv -i <interface> -neep udp +``` + +`__sum16 __skb_checksum_complete(struct sk_buff *skb)` is the name of the kernel +function which will actually check the cksum; it can be traced with `bpftrace` +and the following kretprobe: + +```bash +kretprobe:__skb_checksum_complete +{ + printf("skb_checksum_complete returned: %x\n", retval); +} +``` + +## Manually Calculating UDP Checksums + +A UDP cksum is calculated with the following: + +```bash +1's Complement { + Source IP + + Destination IP + + 17 (0x0011 - UDP protocol code) + + UDP Packet Length + + Source Port + + Destination Port + + UDP Packet Length + + Data +} +``` + +A raw tcpdump packet capture is shown below: +```bash +13:23:15.756911 06:56:87:ec:fd:1f > 86:ad:33:29:ff:5e, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 57, id 20891, offset 0, flags [DF], proto UDP (17), length 33) + 10.8.125.12.58980 > 192.168.10.2.sapv1: [bad udp cksum 0xd301 -> 0xaf43!] UDP, length 5 + 0x0000: 86ad 3329 ff5e 0656 87ec fd1f 0800 4500 + 0x0010: 0021 519b 4000 3911 9e72 0a08 7d0c c0a8 + 0x0020: 0a02 e664 2693 000d d301 7465 7374 0a00 + 0x0030: 0000 0000 d2f2 935d 0000 0000 +``` + +Using this along with our knowledge of a UDP packet we can quickly and manually +calculate the cksum like so: + +```bash +0x0a08 Src IP word 1 +0x7d0c Src IP word 2 +0xc0a8 Dst IP word 1 +0x0a02 Dst IP word 2 +0x0011 Proto +0x000d Length (pseudo-header) +0xe664 Src Port +0x2693 Dst Port +0x000d Length (UDP header) +0x7465 Data +0x7374 Data +0x0a00 Data ++ +------------- +50bc (after folding carries) -> 1's complement = af43 +``` + +To play with this same raw data in Wireshark we can use the text from the hex dump +and convert it to the following format. With this in a file you can then +"Import from hex dump" in Wireshark. 
+ +```bash +13:23:15 +0000 86 ad 33 29 ff 5e 06 56 87 ec fd 1f 08 00 45 00 +0010 00 21 51 9b 40 00 39 11 9e 72 0a 08 7d 0c c0 a8 +0020 0a 02 e6 64 26 93 00 0d d3 01 74 65 73 74 0a 00 +0030 00 00 00 00 d2 f2 93 5d 00 00 00 00 +``` + +![Above Raw packet shown in wireshark](./wireshark.png) + +## Tracing Non XDP stack (native kernel) with PWRU + +Cilium's [PWRU](https://github.com/cilium/pwru) is a great tool for tracing packets +as they make their way through the linux kernel. It is limited in the fact that it +doesn't really track the XDP stack currently, however it's still super helpful +for debugging other issues. + +### Working Trace (manually re-writing Cksums) + +```bash +0xffff96d3956d4f00 8 [ksoftirqd/8] udp4_gro_receive +0xffff96d3956d4f00 8 [ksoftirqd/8] udp_gro_receive +0xffff96d3956d4f00 8 [ksoftirqd/8] skb_defer_rx_timestamp +0xffff96d3956d4f00 8 [ksoftirqd/8] tpacket_rcv +0xffff96d3956d4f00 8 [ksoftirqd/8] skb_push +0xffff96d3956d4f00 8 [ksoftirqd/8] tpacket_get_timestamp +0xffff96d3956d4f00 8 [ksoftirqd/8] consume_skb +0xffff96d3956d4f00 10 [nc] skb_consume_udp +0xffff96d3956d4f00 10 [nc] skb_consume_udp +0xffff96d3956d4f00 10 [nc] __consume_stateless_skb +0xffff96d3956d4f00 10 [nc] skb_release_data +0xffff96d3956d4f00 10 [nc] skb_free_head +0xffff96d3956d4f00 10 [nc] kfree_skbmem +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_rcv_core +0xffff96d3956d4f00 8 [ksoftirqd/8] pskb_trim_rcsum_slow +0xffff96d3956d4f00 8 [ksoftirqd/8] udp_v4_early_demux +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_route_input_noref +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_route_input_rcu +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_route_input_slow +0xffff96d3956d4f00 8 [ksoftirqd/8] fib_validate_source +0xffff96d3956d4f00 8 [ksoftirqd/8] __fib_validate_source +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_local_deliver +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_local_deliver_finish +0xffff96d3956d4f00 8 [ksoftirqd/8] ip_protocol_deliver_rcu +0xffff96d3956d4f00 8 [ksoftirqd/8] raw_local_deliver 
+0xffff96d3956d4f00 8 [ksoftirqd/8] udp_rcv +0xffff96d3956d4f00 8 [ksoftirqd/8] __udp4_lib_rcv +0xffff96d3956d4f00 8 [ksoftirqd/8] __skb_checksum_complete +0xffff96d3956d4f00 8 [ksoftirqd/8] udp_unicast_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] udp_queue_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] udp_queue_rcv_one_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] sk_filter_trim_cap +0xffff96d3956d4f00 8 [ksoftirqd/8] security_sock_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] selinux_socket_sock_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] selinux_sock_rcv_skb_compat +0xffff96d3956d4f00 8 [ksoftirqd/8] selinux_netlbl_sock_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] selinux_xfrm_sock_rcv_skb +0xffff96d3956d4f00 8 [ksoftirqd/8] bpf_lsm_socket_sock_rcv_skb +``` + +### Working Trace (ignoring cksums i.e setting to 0) + +```bash +0xffff96d35c18f000 8 [] udp4_gro_receive +0xffff96d35c18f000 8 [] udp_gro_receive +0xffff96d35c18f000 8 [] skb_defer_rx_timestamp +0xffff96d35c18f000 8 [] tpacket_rcv +0xffff96d35c18f000 8 [] skb_push +0xffff96d35c18f000 8 [] tpacket_get_timestamp +0xffff96d35c18f000 10 [nc] skb_consume_udp +0xffff96d35c18f000 10 [nc] skb_consume_udp +0xffff96d35c18f000 10 [nc] __consume_stateless_skb +0xffff96d35c18f000 10 [nc] skb_release_data +0xffff96d35c18f000 10 [nc] skb_free_head +0xffff96d35c18f000 10 [nc] kfree_skbmem +0xffff96d35c18f000 8 [] consume_skb +0xffff96d35c18f000 8 [] ip_rcv_core +0xffff96d35c18f000 8 [] pskb_trim_rcsum_slow +0xffff96d35c18f000 8 [] udp_v4_early_demux +0xffff96d35c18f000 8 [] ip_route_input_noref +0xffff96d35c18f000 8 [] ip_route_input_rcu +0xffff96d35c18f000 8 [] ip_route_input_slow +0xffff96d35c18f000 8 [] fib_validate_source +0xffff96d35c18f000 8 [] __fib_validate_source +0xffff96d35c18f000 8 [] ip_local_deliver +0xffff96d35c18f000 8 [] ip_local_deliver_finish +0xffff96d35c18f000 8 [] ip_protocol_deliver_rcu +0xffff96d35c18f000 8 [] raw_local_deliver +0xffff96d35c18f000 8 [] udp_rcv +0xffff96d35c18f000 8 [] __udp4_lib_rcv # ----> 
No CKSUM so we don't call __skb_checksum_complete +0xffff96d35c18f000 8 [] udp_unicast_rcv_skbx_ +0xffff96d35c18f000 8 [] udp_queue_rcv_skb +0xffff96d35c18f000 8 [] udp_queue_rcv_one_skb +0xffff96d35c18f000 8 [] sk_filter_trim_cap +0xffff96d35c18f000 8 [] security_sock_rcv_skb +0xffff96d35c18f000 8 [] selinux_socket_sock_rcv_skb +0xffff96d35c18f000 8 [] selinux_sock_rcv_skb_compat +0xffff96d35c18f000 8 [] selinux_netlbl_sock_rcv_skb +0xffff96d35c18f000 8 [] selinux_xfrm_sock_rcv_skb +0xffff96d35c18f000 8 [] bpf_lsm_socket_sock_rcv_skb +0xffff96d35c18f000 8 [] skb_pull_rcsum +``` \ No newline at end of file diff --git a/dataplane/main.go b/dataplane/main.go index c8615fde..96ba25cf 100644 --- a/dataplane/main.go +++ b/dataplane/main.go @@ -58,14 +58,26 @@ func main() { log.Printf("Attached XDP program to iface %q (index %d)", iface.Name, iface.Index) log.Printf("Press Ctrl-C to exit and remove the program") - b := backend{ - saddr: ip2int("172.18.0.1"), - daddr: ip2int("10.244.0.6"), - hwaddr: hwaddr2bytes("9a:fb:6d:e6:a1:26"), - ifindex: 6, + // TODO(astoycos) Shouldn't be hardcoded + b := bpfBackend{ + Saddr: ip2int("10.8.125.12"), + Daddr: ip2int("192.168.10.2"), + Dport: 9875, + // Host-Side Veth Mac + Shwaddr: hwaddr2bytes("06:56:87:ec:fd:1f"), + // Container-Side Veth Mac + Dhwaddr: hwaddr2bytes("86:ad:33:29:ff:5e"), + Nocksum: 1, + Ifindex: 8, } - if err := objs.Backends.Update(ip2int("172.18.0.100"), b, ebpf.UpdateAny); err != nil { + // TODO(astoycos) Shouldn't be hardcoded + key := bpfVipKey{ + Vip: ip2int("10.8.125.12"), + Port: 8888, + } + + if err := objs.Backends.Update(key, b, ebpf.UpdateAny); err != nil { fmt.Println(err.Error()) os.Exit(1) } @@ -74,13 +86,6 @@ func main() { } } -type backend struct { - saddr uint32 - daddr uint32 - hwaddr [6]uint8 - ifindex uint16 -} - func ip2int(ip string) uint32 { ipaddr := net.ParseIP(ip) return binary.LittleEndian.Uint32(ipaddr.To4()) diff --git a/dataplane/xdp.c b/dataplane/xdp.c index a13c8934..9efcd3e9 
100644 --- a/dataplane/xdp.c +++ b/dataplane/xdp.c @@ -18,6 +18,8 @@ char __license[] SEC("license") = "GPL"; #define MAX_BACKENDS 128 #define MAX_UDP_LENGTH 1480 +#define UDP_PAYLOAD_SIZE(x) (unsigned int)(((bpf_htons(x) - sizeof(struct udphdr)) * 8 ) / 4) + static __always_inline void ip_from_int(__u32 *buf, __be32 ip) { buf[0] = (ip >> 0 ) & 0xFF; buf[1] = (ip >> 8 ) & 0xFF; @@ -51,28 +53,42 @@ static __always_inline __u16 iph_csum(struct iphdr *iph) { static __always_inline __u16 udp_checksum(struct iphdr *ip, struct udphdr * udp, void * data_end) { udp->check = 0; - __u16 csum = 0; - __u16 *buf = (__u16*)udp; - - csum += ip->saddr; - csum += ip->saddr >> 16; - csum += ip->daddr; - csum += ip->daddr >> 16; - csum += (__u16)ip->protocol << 8; - csum += udp->len; + // So we can overflow a bit make this __u32 + __u32 csum_total = 0; + __u16 csum; + __u16 *buf = (void *)udp; + + csum_total += (__u16)ip->saddr; + csum_total += (__u16)(ip->saddr >> 16); + csum_total += (__u16)ip->daddr; + csum_total += (__u16)(ip->daddr >> 16); + csum_total += (__u16)(ip->protocol << 8); + csum_total += udp->len; + + // The number of nibbles in the UDP header + Payload + unsigned int udp_packet_nibbles = UDP_PAYLOAD_SIZE(udp->len); + + // Here we only want to iterate through payload + // NOT trailing bits + for (int i = 0; i <= MAX_UDP_LENGTH; i += 2) { + if (i > udp_packet_nibbles) { + break; + } - for (int i = 0; i < MAX_UDP_LENGTH; i += 2) { if ((void *)(buf + 1) > data_end) { break; } - csum += *buf; + csum_total += *buf; buf++; } if ((void *)buf + 1 <= data_end) { - csum += *(__u8 *)buf; + csum_total += (*(__u8 *)buf); } + // Add any cksum overflow back into __u16 + csum = (__u16)csum_total + (__u16)(csum_total >> 16); + csum = ~csum; return csum; } @@ -80,14 +96,27 @@ static __always_inline __u16 udp_checksum(struct iphdr *ip, struct udphdr * udp, struct backend { __u32 saddr; __u32 daddr; - __u8 hwaddr[6]; + __u16 dport; + __u8 shwaddr[6]; + __u8 dhwaddr[6]; __u16 ifindex; + 
// Cksum isn't required for UDP see: + // https://en.wikipedia.org/wiki/User_Datagram_Protocol + __u8 nocksum; + __u8 pad[3]; +}; + + +struct vip_key { + __u32 vip; + __u16 port; + __u8 pad[2]; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, MAX_BACKENDS); - __type(key, __u32); + __type(key, struct vip_key); __type(value, struct backend); } backends SEC(".maps"); @@ -132,12 +161,15 @@ int xdp_prog_func(struct xdp_md *ctx) { // Routing // --------------------------------------------------------------------------- - __u32 original_dest_ip = ip->daddr; + struct vip_key key = { + .vip = ip->daddr, + .port = bpf_ntohs(udp->dest) + }; struct backend *bk; - bk = bpf_map_lookup_elem(&backends, &original_dest_ip); + bk = bpf_map_lookup_elem(&backends, &key); if (!bk) { - bpf_printk("no backends for ip %x", original_dest_ip); + bpf_printk("no backends for ip %x:%x", key.vip, key.port); return XDP_PASS; } @@ -153,19 +185,32 @@ int xdp_prog_func(struct xdp_md *ctx) { bpf_printk_ip(ip->saddr); bpf_printk("updated daddr to:"); bpf_printk_ip(ip->daddr); + + if (udp->dest != bpf_ntohs(bk->dport)) { + udp->dest = bpf_ntohs(bk->dport); + bpf_printk("updated dport to: %d", bk->dport); + } - memcpy(eth->h_source, eth->h_dest, sizeof(eth->h_source)); + memcpy(eth->h_source, bk->shwaddr, sizeof(eth->h_source)); bpf_printk("new source hwaddr %x:%x:%x:%x:%x:%x", eth->h_source[0], eth->h_source[1], eth->h_source[2], eth->h_source[3], eth->h_source[4], eth->h_source[5]); - memcpy(eth->h_dest, bk->hwaddr, sizeof(eth->h_dest)); + memcpy(eth->h_dest, bk->dhwaddr, sizeof(eth->h_dest)); bpf_printk("new dest hwaddr %x:%x:%x:%x:%x:%x", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2], eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]); ip->check = iph_csum(ip); - udp->check = udp_checksum(ip, udp, data_end); + udp->check = 0; + + if (!bk->nocksum){ + udp->check = udp_checksum(ip, udp, data_end); + } bpf_printk("destination interface index %d", bk->ifindex); + + int action = 
bpf_redirect(bk->ifindex, 0); - return bpf_redirect(bk->ifindex, 0); + bpf_printk("redirect action: %d", action); + + return action; } SEC("xdp") From 2a306a69d4a128dffae8565123d80083d0955803 Mon Sep 17 00:00:00 2001 From: Andrew Stoycos Date: Wed, 2 Nov 2022 10:27:38 -0400 Subject: [PATCH 2/2] `go mod tidy` Signed-off-by: Andrew Stoycos --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index dc4a3b41..8cc790e6 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,10 @@ go 1.19 require ( github.com/kong/kubernetes-testing-framework v0.22.3 + k8s.io/api v0.25.2 k8s.io/apimachinery v0.25.2 k8s.io/client-go v0.25.2 + k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed sigs.k8s.io/controller-runtime v0.13.0 sigs.k8s.io/gateway-api v0.5.1 ) @@ -85,12 +87,10 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/api v0.25.2 // indirect k8s.io/apiextensions-apiserver v0.25.2 // indirect k8s.io/component-base v0.25.2 // indirect k8s.io/klog/v2 v2.70.1 // indirect k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect - k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect sigs.k8s.io/kind v0.16.0 // indirect sigs.k8s.io/kustomize/api v0.12.1 // indirect