Cilium Data Path Analysis (VxLAN & Host Routing with Legacy)
Environment Setup
Node
root@node1:~# kubectl get nodes -owide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
node1 Ready control-plane 13d v1.24.6 192.168.64.8 <none> Ubuntu 22.04.2 LTS 5.15.0-75-generic containerd://1.6.8
node2 Ready control-plane 13d v1.24.6 192.168.64.9 <none> Ubuntu 22.04.2 LTS 5.15.0-75-generic containerd://1.6.8
node3 Ready <none> 13d v1.24.6 192.168.64.10 <none> Ubuntu 22.04.2 LTS 5.15.0-75-generic containerd://1.6.8
Host Routing Mode
root@node2:/home/cilium# cilium status
KVStore: Ok etcd: 3/3 connected, lease-ID=797e8963af868a82, lock lease-ID=3f888935026348b5, has-quorum=true: https://192.168.64.10:2379 - 3.5.4; https://192.168.64.8:2379 - 3.5.4 (Leader); https://192.168.64.9:2379 - 3.5.4
Kubernetes: Ok 1.24 (v1.24.6) [linux/amd64]
Kubernetes APIs: ["cilium/v2::CiliumClusterwideNetworkPolicy", "cilium/v2::CiliumNetworkPolicy", "core/v1::Namespace", "core/v1::Node", "core/v1::Pods", "core/v1::Service", "discovery/v1::EndpointSlice", "networking.k8s.io/v1::NetworkPolicy"]
KubeProxyReplacement: Probe [enp0s2 192.168.64.9]
Host firewall: Disabled
CNI Chaining: none
Cilium: Ok 1.12.1 (v1.12.1-4c9a630)
NodeMonitor: Disabled
IPAM: IPv4: 5/254 allocated from 10.233.65.0/24,
BandwidthManager: Disabled
Host Routing: Legacy # the mode here has been switched to Legacy
Masquerading: IPTables [IPv4: Enabled, IPv6: Disabled]
Controller Status: 39/39 healthy
Proxy Status: OK, ip 10.233.65.227, 0 redirects active on ports 10000-20000
Global Identity Range: min 256, max 65535
Hubble: Disabled
Encryption: Disabled
Cluster health: Warning cilium-health daemon unreachable
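For reference, the Host Routing mode is driven by the agent configuration; to the best of my knowledge the 1.12 Helm chart exposes this as bpf.hostLegacyRouting, which maps to enable-host-legacy-routing in the cilium-config ConfigMap (both names are assumptions worth verifying against your chart version). A minimal sketch for checking and forcing Legacy mode:
# Check whether legacy host routing is set in the agent config (key name per the assumption above)
root@node2:~# kubectl -n kube-system get configmap cilium-config -o yaml | grep -i legacy-routing
# Force legacy host routing via Helm (1.12 chart value, per the same assumption)
helm upgrade cilium cilium/cilium -n kube-system --reuse-values --set bpf.hostLegacyRouting=true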
Data Path Analysis
Same Node
root@node1:~# kubectl get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
busybox-5587dd9dcc-8x25g 1/1 Running 2 (2d2h ago) 5d20h 10.233.64.174 node1 <none> <none>
busybox-5587dd9dcc-glk9p 1/1 Running 2 (2d2h ago) 5d20h 10.233.66.244 node3 <none> <none>
busybox-5587dd9dcc-mnn8h 1/1 Running 2 (2d2h ago) 5d20h 10.233.65.192 node2 <none> <none>
nginx-deployment-669895d846-bn9v7 1/1 Running 0 9d 10.233.65.7 node2 <none> <none>
The goal of this experiment is to ping the nginx pod on node2 from the busybox pod on node2 and analyze the data path.
# busybox pod on node2
root@node2:~# crictl ps | grep busybox
2f538b17b682f a416a98b71e22 2 days ago Running busybox 2 264de6e9b5dcc busybox-5587dd9dcc-mnn8h
# Get busybox's network namespace
root@node2:~# crictl inspect 2f538b17b682f | grep proc
"/proc/acpi",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
"process": {
"destination": "/proc",
"type": "proc",
"source": "proc",
"path": "/proc/448037/ns/ipc"
"path": "/proc/448037/ns/uts"
"path": "/proc/448037/ns/net"
"/proc/acpi",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
# Enter busybox's network (and UTS) namespace
root@node2:~# nsenter -n -t 448037 -u bash
# View busybox's interfaces and the ifindex of the veth peer in the root namespace
root@busybox-5587dd9dcc-mnn8h:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
30: eth0@if31: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether de:92:74:30:23:ff brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 10.233.65.192/32 scope global eth0
valid_lft forever preferred_lft forever
inet6 fe80::dc92:74ff:fe30:23ff/64 scope link
valid_lft forever preferred_lft forever
# Check the routing table
root@busybox-5587dd9dcc-mnn8h:~# ip route
default via 10.233.65.227 dev eth0 mtu 1450
10.233.65.227 dev eth0 scope link
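The default gateway 10.233.65.227 is not another pod; it is the per-node router IP held by the cilium_host interface on node2 (the same address that appears as the Proxy Status IP in the cilium status output above). Assuming the default interface name, this can be confirmed on the node:
# cilium_host carries the router IP that every local pod uses as its default gateway
root@node2:~# ip addr show cilium_host | grep inet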
# Find the peer ifindex of eth0's veth pair (the lxc interface in the root namespace)
root@busybox-5587dd9dcc-mnn8h:~# ethtool -S eth0
NIC statistics:
peer_ifindex: 31
rx_queue_0_xdp_packets: 0
rx_queue_0_xdp_bytes: 0
rx_queue_0_drops: 0
rx_queue_0_xdp_redirect: 0
rx_queue_0_xdp_drops: 0
rx_queue_0_xdp_tx: 0
rx_queue_0_xdp_tx_errors: 0
tx_queue_0_xdp_xmit: 0
tx_queue_0_xdp_xmit_errors: 0
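The peer index is also visible directly in the ip a output above: eth0@if31 means the other end of the veth pair sits at ifindex 31 in another namespace. A convenience one-liner to resolve that index to a name in the root namespace:
# Resolve ifindex 31 to the host-side lxc interface name
root@node2:~# ip -o link | awk -F': ' '$1 == 31 {print $2}'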
# The busybox veth peer in the root namespace
root@node2:~# ip a | grep 31
31: lxccea7daed89b0@if30: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 1a:a8:3d:a8:c3:e9 brd ff:ff:ff:ff:ff:ff link-netns cni-82033108-1cad-7e52-eb96-914074d3efa6
# Show the tc program attached to the ingress hook of this interface
root@node2:~# tc filter show dev lxccea7daed89b0 ingress
filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_lxc.o:[from-container] direct-action not_in_hw id 31686 tag 2678facd09e74363 jited
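The program id reported by tc (31686 here) can be examined further with bpftool, assuming it is installed on the node:
# Show metadata of the from-container program attached above
root@node2:~# bpftool prog show id 31686
# List all eBPF programs attached to this device (tc and XDP)
root@node2:~# bpftool net show dev lxccea7daed89b0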
# Add an iptables TRACE rule to observe how the packets are handled by iptables
root@node2:~# iptables -t raw -I PREROUTING -p icmp -j TRACE
# Then watch the trace logs in syslog
root@node2:~# tail -f /var/log/syslog
# Delete the iptables rule added above
root@node2:~# iptables -t raw -D PREROUTING 1
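Note that if the node's iptables uses the nft backend (the Ubuntu 22.04 default), the TRACE target emits nftables trace events instead of kernel log lines; in that case they are easier to watch with xtables-monitor:
# Watch TRACE events when the iptables-nft backend is in use
root@node2:~# xtables-monitor --trace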
# Capture the traffic passing through lxccea7daed89b0
root@node2:~# tcpdump -pne -i lxccea7daed89b0
# Capture the traffic passing through eth0
root@busybox-5587dd9dcc-mnn8h:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
# List the endpoints managed by cilium-agent on node2; id 3508 is the busybox endpoint on node2
root@node2:/home/cilium# cilium bpf endpoint list
IP ADDRESS LOCAL ENDPOINT INFO
192.168.64.9:0 (localhost)
10.233.65.7:0 id=226 flags=0x0000 ifindex=29 mac=2A:F9:C2:40:CB:5A nodemac=DA:37:5B:AC:21:F2
10.233.65.227:0 (localhost)
10.233.65.41:0 id=2246 flags=0x0000 ifindex=15 mac=06:79:BE:64:23:E5 nodemac=96:AF:F0:92:98:F4
10.233.65.234:0 id=1496 flags=0x0000 ifindex=65 mac=CE:C7:0A:5C:5C:AF nodemac=3E:CD:6D:44:2D:FE
10.233.65.192:0 id=3508 flags=0x0000 ifindex=31 mac=DE:92:74:30:23:FF nodemac=1A:A8:3D:A8:C3:E9
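The endpoint id can be cross-checked against the pod from the agent side as well, for example (run inside the cilium-agent pod, as above):
# Endpoint 3508 should list the busybox pod IP 10.233.65.192 and its identity
root@node2:/home/cilium# cilium endpoint list | grep 10.233.65.192
# Full details of a single endpoint
root@node2:/home/cilium# cilium endpoint get 3508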
root@node2:/home/cilium# cilium monitor --related-to 3508 -vv
Press Ctrl-C to quit
# Exec into nginx to get the ifindex of the veth peer in the root namespace
root@node2:~# kubectl exec -it nginx-deployment-669895d846-bn9v7 -- sh
/ # ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
28: eth0@if29: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue state UP qlen 1000
link/ether 2a:f9:c2:40:cb:5a brd ff:ff:ff:ff:ff:ff
inet 10.233.65.7/32 scope global eth0
valid_lft forever preferred_lft forever
inet6 fe80::28f9:c2ff:fe40:cb5a/64 scope link
valid_lft forever preferred_lft forever
/ # exit
# The nginx veth peer in the root namespace
root@node2:~# ip a | grep 29
29: lxc89fe98f4c9ed@if28: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxc89fe98f4c9ed, link-type EN10MB (Ethernet), snapshot length 262144 bytes
Ping the nginx pod on node2 from the busybox pod on node2
root@busybox-5587dd9dcc-mnn8h:~# ping 10.233.65.7 -c 1
PING 10.233.65.7 (10.233.65.7) 56(84) bytes of data.
64 bytes from 10.233.65.7: icmp_seq=1 ttl=63 time=4.12 ms
--- 10.233.65.7 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 4.120/4.120/4.120/0.000 ms
Capture point 1 - busybox's eth0
root@busybox-5587dd9dcc-mnn8h:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.009616 de:92:74:30:23:ff > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.192, length 28
11:34:00.009663 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at 1a:a8:3d:a8:c3:e9, length 28
11:34:00.009684 de:92:74:30:23:ff > 1a:a8:3d:a8:c3:e9, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012892 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64
Capture point 2 - the busybox veth peer in the root namespace
root@node2:~# tcpdump -pne -i lxccea7daed89b0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxccea7daed89b0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.009630 de:92:74:30:23:ff > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.192, length 28
11:34:00.009650 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at 1a:a8:3d:a8:c3:e9, length 28
11:34:00.009694 de:92:74:30:23:ff > 1a:a8:3d:a8:c3:e9, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012884 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64
Capture point 3 - the nginx veth peer in the root namespace
root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxc89fe98f4c9ed, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.010447 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012797 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64
11:34:05.149572 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.7, length 28
11:34:05.149882 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at da:37:5b:ac:21:f2, length 28
Capture point 4 - the busybox endpoint in cilium-agent on node2
root@node2:/home/cilium# cilium monitor --related-to 3508 -vv
Press Ctrl-C to quit
------------------------------------------------------------------------------
level=info msg="Initializing dissection cache..." subsys=monitor
Ethernet {Contents=[..14..] Payload=[..86..] SrcMAC=1a:a8:3d:a8:c3:e9 DstMAC=de:92:74:30:23:ff EthernetType=IPv4 Length=0}
IPv4 {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=28212 Flags= FragOffset=0 TTL=63 Protocol=ICMPv4 Checksum=29916 SrcIP=10.233.65.7 DstIP=10.233.65.192 Options=[] Padding=[]}
ICMPv4 {Contents=[..8..] Payload=[..56..] TypeCode=EchoReply Checksum=38269 Id=33753 Seq=1}
Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 3508 to-endpoint: 98 bytes (98 captured), state reply, interface lxccea7daed89b0, , identity 5957->31615, orig-ip 10.233.65.7, to endpoint 3508
Capture point 5 - the iptables chains
root@node2:~# tail -f /var/log/syslog
From the captures above, the packets seen on busybox's eth0 and on its lxc peer in the root namespace are identical, yet the iptables TRACE rule produced no log entries, so the packets never traversed the host namespace's iptables chains. Combined with the [from-container] program attached to the lxc interface's tc ingress hook, this shows the redirect capability is in use: once the pod sends traffic, it is picked up by the eBPF program on the tc ingress hook of the pod's host-side lxc interface; the program looks up the destination pod, determines that it is on the same node, and uses the redirect helper to bypass the kernel stack and deliver the traffic straight to the destination pod's lxc interface, which then hands it to the destination pod.
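As a quick sanity check of this short-circuit (a sketch, assuming the default cilium_host interface name), capturing on cilium_host during the same-node ping should show none of the pod-to-pod ICMP packets, and while the TRACE rule is installed its packet counter should stay at zero:
# Nothing between 10.233.65.192 and 10.233.65.7 should appear here: the packets are redirected lxc-to-lxc in eBPF
root@node2:~# tcpdump -pne -i cilium_host icmp
# The TRACE rule's packet counter (while the rule is still installed) also stays at 0
root@node2:~# iptables -t raw -L PREROUTING -v -n --line-numbers | head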
Different Node
Ping the nginx pod on node2 from the busybox pod on node1
root@busybox-5587dd9dcc-8x25g:~# ping 10.233.65.7 -c 1
PING 10.233.65.7 (10.233.65.7) 56(84) bytes of data.
64 bytes from 10.233.65.7: icmp_seq=1 ttl=63 time=4.12 ms
--- 10.233.65.7 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 4.120/4.120/4.120/0.000 ms
Capture point 1 - busybox's eth0
root@busybox-5587dd9dcc-8x25g:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
14:28:47.446003 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.446742 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.638498 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype ARP (0x0806), length 42: Request who-has 10.233.64.93 tell 10.233.64.174, length 28
14:28:52.638939 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype ARP (0x0806), length 42: Reply 10.233.64.93 is-at 32:12:64:c2:23:5b, length 28
Capture point 2 - the busybox veth peer in the root namespace
root@node1:~# tcpdump -pne -i lxca58908f2f283
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxca58908f2f283, link-type EN10MB (Ethernet), snapshot length 262144 bytes
14:28:47.446011 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.446735 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.638509 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype ARP (0x0806), length 42: Request who-has 10.233.64.93 tell 10.233.64.174, length 28
14:28:52.638928 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype ARP (0x0806), length 42: Reply 10.233.64.93 is-at 32:12:64:c2:23:5b, length 28
Capture point 3 - the nginx veth peer in the root namespace
root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
14:28:47.459487 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.459514 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.702616 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.7, length 28
14:28:52.703103 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at da:37:5b:ac:21:f2, length 28
Capture point 4 - the busybox endpoint in cilium-agent on node1
root@node1:/home/cilium# cilium monitor --related-to 443 -vv
Press Ctrl-C to quit
------------------------------------------------------------------------------
level=info msg="Initializing dissection cache..." subsys=monitor
Ethernet {Contents=[..14..] Payload=[..86..] SrcMAC=fe:91:05:b3:8a:de DstMAC=32:12:64:c2:23:5b EthernetType=IPv4 Length=0}
IPv4 {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=48335 Flags=DF FragOffset=0 TTL=64 Protocol=ICMPv4 Checksum=58962 SrcIP=10.233.64.174 DstIP=10.233.65.7 Options=[] Padding=[]}
ICMPv4 {Contents=[..8..] Payload=[..56..] TypeCode=EchoRequest Checksum=58151 Id=11088 Seq=0}
Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 443 to-overlay: 98 bytes (98 captured), state new, interface cilium_vxlan, , identity 31615->unknown, orig-ip 0.0.0.0
------------------------------------------------------------------------------
Ethernet {Contents=[..14..] Payload=[..86..] SrcMAC=32:12:64:c2:23:5b DstMAC=fe:91:05:b3:8a:de EthernetType=IPv4 Length=0}
IPv4 {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=47340 Flags= FragOffset=0 TTL=63 Protocol=ICMPv4 Checksum=11062 SrcIP=10.233.65.7 DstIP=10.233.64.174 Options=[] Padding=[]}
ICMPv4 {Contents=[..8..] Payload=[..56..] TypeCode=EchoReply Checksum=60199 Id=11088 Seq=0}
Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 443 to-endpoint: 98 bytes (98 captured), state reply, interface lxca58908f2f283, , identity 5957->31615, orig-ip 10.233.65.7, to endpoint 443
tc programs on the VxLAN device
root@node1:~# tc filter show dev cilium_vxlan ingress
filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_overlay.o:[from-overlay] direct-action not_in_hw id 38054 tag 8dadd616a2c190d7 jited
root@node1:~# tc filter show dev cilium_vxlan egress
filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_overlay.o:[to-overlay] direct-action not_in_hw id 38063 tag 634ef5728fd44f7a jited
The captures above show that pod-to-pod traffic across nodes goes through the VXLAN tunnel: unlike the same-node case, the cilium-agent monitor on node1 records an extra to-overlay trace for the request packet, and the VXLAN-encapsulated packets are then processed by iptables in the host namespace.
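Two more checks can confirm the overlay path (a sketch: the uplink name enp0s2 is taken from the cilium status output above and assumed to be the same on node1, and 8472/udp is the kernel's default VXLAN port, which Cilium uses unless the tunnel port is overridden):
# Tunnel map: remote PodCIDR -> remote node IP used as the VXLAN outer destination
root@node1:/home/cilium# cilium bpf tunnel list
# Capture the encapsulated traffic on node1's uplink; the inner ICMP between the pod IPs is wrapped in VXLAN
root@node1:~# tcpdump -pne -i enp0s2 udp port 8472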