Cilium Data Path Analysis (VxLAN & Legacy Host Routing)

Environment Setup

Node

root@node1:~# kubectl get nodes -owide
NAME    STATUS   ROLES           AGE   VERSION   INTERNAL-IP     EXTERNAL-IP   OS-IMAGE             KERNEL-VERSION      CONTAINER-RUNTIME
node1   Ready    control-plane   13d   v1.24.6   192.168.64.8    <none>        Ubuntu 22.04.2 LTS   5.15.0-75-generic   containerd://1.6.8
node2   Ready    control-plane   13d   v1.24.6   192.168.64.9    <none>        Ubuntu 22.04.2 LTS   5.15.0-75-generic   containerd://1.6.8
node3   Ready    <none>          13d   v1.24.6   192.168.64.10   <none>        Ubuntu 22.04.2 LTS   5.15.0-75-generic   containerd://1.6.8

Host Routing Mode

root@node2:/home/cilium# cilium status
KVStore:                 Ok   etcd: 3/3 connected, lease-ID=797e8963af868a82, lock lease-ID=3f888935026348b5, has-quorum=true: https://192.168.64.10:2379 - 3.5.4; https://192.168.64.8:2379 - 3.5.4 (Leader); https://192.168.64.9:2379 - 3.5.4
Kubernetes:              Ok   1.24 (v1.24.6) [linux/amd64]
Kubernetes APIs:         ["cilium/v2::CiliumClusterwideNetworkPolicy", "cilium/v2::CiliumNetworkPolicy", "core/v1::Namespace", "core/v1::Node", "core/v1::Pods", "core/v1::Service", "discovery/v1::EndpointSlice", "networking.k8s.io/v1::NetworkPolicy"]
KubeProxyReplacement:    Probe   [enp0s2 192.168.64.9]
Host firewall:           Disabled
CNI Chaining:            none
Cilium:                  Ok   1.12.1 (v1.12.1-4c9a630)
NodeMonitor:             Disabled
IPAM:                    IPv4: 5/254 allocated from 10.233.65.0/24, 
BandwidthManager:        Disabled
Host Routing:            Legacy # note: the host routing mode here has been switched to Legacy
Masquerading:            IPTables [IPv4: Enabled, IPv6: Disabled]
Controller Status:       39/39 healthy
Proxy Status:            OK, ip 10.233.65.227, 0 redirects active on ports 10000-20000
Global Identity Range:   min 256, max 65535
Hubble:                  Disabled
Encryption:              Disabled
Cluster health:                Warning   cilium-health daemon unreachable
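
Legacy host routing corresponds to the cilium-agent option enable-host-legacy-routing=true, i.e. traffic between the host stack and pods takes the regular kernel routing path instead of BPF host routing. As a small sketch (assuming a default install with the cilium-config ConfigMap in kube-system), the setting can be cross-checked like this:

# check the host-routing related keys in the Cilium ConfigMap (names assume a default install)
root@node1:~# kubectl -n kube-system get cm cilium-config -o yaml | grep -i legacy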

Data Path Analysis

Same Node

root@node1:~# kubectl get po -owide
NAME                                READY   STATUS    RESTARTS       AGE     IP              NODE    NOMINATED NODE   READINESS GATES
busybox-5587dd9dcc-8x25g            1/1     Running   2 (2d2h ago)   5d20h   10.233.64.174   node1   <none>           <none>
busybox-5587dd9dcc-glk9p            1/1     Running   2 (2d2h ago)   5d20h   10.233.66.244   node3   <none>           <none>
busybox-5587dd9dcc-mnn8h            1/1     Running   2 (2d2h ago)   5d20h   10.233.65.192   node2   <none>           <none>
nginx-deployment-669895d846-bn9v7   1/1     Running   0              9d      10.233.65.7     node2   <none>           <none>

The goal of this experiment is to ping the nginx pod on node2 from the busybox pod on node2 and analyze how the traffic flows.

# the busybox pod on node2
root@node2:~# crictl ps | grep busybox
2f538b17b682f       a416a98b71e22       2 days ago          Running             busybox                   2                   264de6e9b5dcc       busybox-5587dd9dcc-mnn8h
# get busybox's network namespace (look for the /proc/<pid>/ns/net path in the output)
root@node2:~# crictl inspect 2f538b17b682f | grep proc 
            "/proc/acpi",
            "/proc/kcore",
            "/proc/keys",
            "/proc/latency_stats",
            "/proc/timer_list",
            "/proc/timer_stats",
            "/proc/sched_debug",
            "/proc/scsi",
            "/proc/asound",
            "/proc/bus",
            "/proc/fs",
            "/proc/irq",
            "/proc/sys",
            "/proc/sysrq-trigger"
      "process": {
          "destination": "/proc",
          "type": "proc",
          "source": "proc",
            "path": "/proc/448037/ns/ipc"
            "path": "/proc/448037/ns/uts"
            "path": "/proc/448037/ns/net"
          "/proc/acpi",
          "/proc/kcore",
          "/proc/keys",
          "/proc/latency_stats",
          "/proc/timer_list",
          "/proc/timer_stats",
          "/proc/sched_debug",
          "/proc/scsi",
          "/proc/asound",
          "/proc/bus",
          "/proc/fs",
          "/proc/irq",
          "/proc/sys",
          "/proc/sysrq-trigger"
        
# enter busybox's network (and UTS) namespace
root@node2:~# nsenter -n -t 448037 -u bash
# view busybox's interfaces and the ifindex of the veth peer on the root namespace side
root@busybox-5587dd9dcc-mnn8h:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
30: eth0@if31: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether de:92:74:30:23:ff brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 10.233.65.192/32 scope global eth0
       valid_lft forever preferred_lft forever
    inet6 fe80::dc92:74ff:fe30:23ff/64 scope link 
       valid_lft forever preferred_lft forever
       
# view the routing table
root@busybox-5587dd9dcc-mnn8h:~# ip route
default via 10.233.65.227 dev eth0 mtu 1450 
10.233.65.227 dev eth0 scope link
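
All traffic leaving the pod is routed to 10.233.65.227, which is the cilium_host address on node2 (it also appears as (localhost) in the endpoint list further below). As a quick check on the node, something like the following should show that address assigned to cilium_host:

# 10.233.65.227 should be assigned to the cilium_host interface on node2
root@node2:~# ip addr show cilium_host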

# find the peer ifindex of eth0's veth pair (the lxc interface in the root namespace)
root@busybox-5587dd9dcc-mnn8h:~# ethtool -S eth0
NIC statistics:
     peer_ifindex: 31
     rx_queue_0_xdp_packets: 0
     rx_queue_0_xdp_bytes: 0
     rx_queue_0_drops: 0
     rx_queue_0_xdp_redirect: 0
     rx_queue_0_xdp_drops: 0
     rx_queue_0_xdp_tx: 0
     rx_queue_0_xdp_tx_errors: 0
     tx_queue_0_xdp_xmit: 0
     tx_queue_0_xdp_xmit_errors: 0
# the root-namespace end of busybox's veth pair
root@node2:~# ip a | grep 31
31: lxccea7daed89b0@if30: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 1a:a8:3d:a8:c3:e9 brd ff:ff:ff:ff:ff:ff link-netns cni-82033108-1cad-7e52-eb96-914074d3efa6
    
# show the tc program attached to the ingress side of this interface
root@node2:~# tc filter show dev lxccea7daed89b0 ingress
filter protocol all pref 1 bpf chain 0 
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_lxc.o:[from-container] direct-action not_in_hw id 31686 tag 2678facd09e74363 jited 
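
The from-container program from bpf_lxc.o handles all traffic leaving the pod. If bpftool is available, the same attachment and the program itself (id 31686 above) can also be inspected, roughly like this:

# list tc/xdp programs attached to the lxc device and show the program metadata (assumes bpftool is installed)
root@node2:~# bpftool net show dev lxccea7daed89b0
root@node2:~# bpftool prog show id 31686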


# add an iptables TRACE rule to observe how the packet is handled by the iptables chains
root@node2:~# iptables -t raw -I PREROUTING -p icmp -j TRACE
# then watch the log (on Ubuntu the TRACE output, if any, normally lands in /var/log/syslog)
root@node2:~# tail -f /var/log/syslog
# delete the iptables rule added above
root@node2:~# iptables -t raw -D PREROUTING 1
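
Note that on Ubuntu 22.04 the iptables frontend is usually the nft-based variant, in which case TRACE events are not written to syslog at all; as an alternative sketch, they can be followed live with the iptables userspace tooling:

# follow netfilter TRACE events when iptables-nft is in use (alternative to tailing the log)
root@node2:~# xtables-monitor --trace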

# capture the traffic passing through lxccea7daed89b0
root@node2:~# tcpdump -pne -i lxccea7daed89b0

# capture the traffic passing through eth0
root@busybox-5587dd9dcc-mnn8h:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes

# list the endpoints managed by cilium-agent on node2; id 3508 is the busybox endpoint on node2
root@node2:/home/cilium# cilium bpf endpoint list
IP ADDRESS        LOCAL ENDPOINT INFO
192.168.64.9:0    (localhost)                                                                         
10.233.65.7:0     id=226   flags=0x0000 ifindex=29  mac=2A:F9:C2:40:CB:5A nodemac=DA:37:5B:AC:21:F2   
10.233.65.227:0   (localhost)                                                                         
10.233.65.41:0    id=2246  flags=0x0000 ifindex=15  mac=06:79:BE:64:23:E5 nodemac=96:AF:F0:92:98:F4   
10.233.65.234:0   id=1496  flags=0x0000 ifindex=65  mac=CE:C7:0A:5C:5C:AF nodemac=3E:CD:6D:44:2D:FE   
10.233.65.192:0   id=3508  flags=0x0000 ifindex=31  mac=DE:92:74:30:23:FF nodemac=1A:A8:3D:A8:C3:E9   
root@node2:/home/cilium# cilium monitor --related-to 3508 -vv
Press Ctrl-C to quit
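
The table above is backed by the cilium_lxc BPF map, which the from-container program consults to find local endpoints. As a sketch (the pin path below is the usual default bpffs layout, but may differ per installation), the map can also be dumped directly:

# dump the local endpoint map used by the datapath (path assumes the default bpffs layout)
root@node2:~# bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_lxc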

# exec into nginx to get the ifindex of the veth peer on the root namespace side
root@node2:~# kubectl exec -it nginx-deployment-669895d846-bn9v7 -- sh
/ # ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
28: eth0@if29: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue state UP qlen 1000
    link/ether 2a:f9:c2:40:cb:5a brd ff:ff:ff:ff:ff:ff
    inet 10.233.65.7/32 scope global eth0
       valid_lft forever preferred_lft forever
    inet6 fe80::28f9:c2ff:fe40:cb5a/64 scope link 
       valid_lft forever preferred_lft forever
/ # exit
# find the root-namespace end of nginx's veth pair
root@node2:~# ip a | grep 29
29: lxc89fe98f4c9ed@if28: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxc89fe98f4c9ed, link-type EN10MB (Ethernet), snapshot length 262144 bytes

Ping the nginx pod on node2 from the busybox pod on node2

root@busybox-5587dd9dcc-mnn8h:~# ping 10.233.65.7 -c 1
PING 10.233.65.7 (10.233.65.7) 56(84) bytes of data.
64 bytes from 10.233.65.7: icmp_seq=1 ttl=63 time=4.12 ms

--- 10.233.65.7 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 4.120/4.120/4.120/0.000 ms

Capture point 1 - busybox's eth0

root@busybox-5587dd9dcc-mnn8h:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.009616 de:92:74:30:23:ff > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.192, length 28
11:34:00.009663 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at 1a:a8:3d:a8:c3:e9, length 28
11:34:00.009684 de:92:74:30:23:ff > 1a:a8:3d:a8:c3:e9, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012892 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64

Capture point 2 - the root-namespace end of busybox's veth pair

root@node2:~# tcpdump -pne -i lxccea7daed89b0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxccea7daed89b0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.009630 de:92:74:30:23:ff > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.192, length 28
11:34:00.009650 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at 1a:a8:3d:a8:c3:e9, length 28
11:34:00.009694 de:92:74:30:23:ff > 1a:a8:3d:a8:c3:e9, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012884 1a:a8:3d:a8:c3:e9 > de:92:74:30:23:ff, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64

Capture point 3 - the root-namespace end of nginx's veth pair

root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxc89fe98f4c9ed, link-type EN10MB (Ethernet), snapshot length 262144 bytes
11:34:00.010447 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype IPv4 (0x0800), length 98: 10.233.65.192 > 10.233.65.7: ICMP echo request, id 33753, seq 1, length 64
11:34:00.012797 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.65.192: ICMP echo reply, id 33753, seq 1, length 64
11:34:05.149572 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.7, length 28
11:34:05.149882 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at da:37:5b:ac:21:f2, length 28

Capture point 4 - the busybox endpoint watched via cilium-agent on node2

root@node2:/home/cilium# cilium monitor --related-to 3508 -vv
Press Ctrl-C to quit
------------------------------------------------------------------------------
level=info msg="Initializing dissection cache..." subsys=monitor
Ethernet        {Contents=[..14..] Payload=[..86..] SrcMAC=1a:a8:3d:a8:c3:e9 DstMAC=de:92:74:30:23:ff EthernetType=IPv4 Length=0}
IPv4    {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=28212 Flags= FragOffset=0 TTL=63 Protocol=ICMPv4 Checksum=29916 SrcIP=10.233.65.7 DstIP=10.233.65.192 Options=[] Padding=[]}
ICMPv4  {Contents=[..8..] Payload=[..56..] TypeCode=EchoReply Checksum=38269 Id=33753 Seq=1}
  Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 3508 to-endpoint: 98 bytes (98 captured), state reply, interface lxccea7daed89b0, , identity 5957->31615, orig-ip 10.233.65.7, to endpoint 3508

Capture point 5 - the iptables chains

root@node2:~# tail -f /var/log/syslog

From the captures above we can see that busybox's eth0 and the lxc interface on the root namespace side see exactly the same packets, while the iptables TRACE produced no log entries, which shows the packets never traverse the iptables chains in the host network namespace. Combined with the [from-container] hook loaded on the lxc interface, the datapath here relies on the redirect capability: once the pod sends traffic, it is processed by the eBPF program on the tc ingress hook of the pod's lxc interface on the host; the program looks up the destination pod, determines that it lives on the same node, and uses a redirect to bypass the kernel stack, delivering the traffic straight to the destination pod's lxc interface and from there into the destination pod.
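
One rough way to double-check the "bypasses the host iptables chains" claim (a sketch only; other traffic on the node also moves these counters, so treat the numbers loosely) is to snapshot the FORWARD chain counters around the ping:

# snapshot the FORWARD counters, run the ping from busybox, then snapshot again and diff
root@node2:~# iptables -t filter -L FORWARD -v -n -x > /tmp/fwd-before
# ... ping 10.233.65.7 from the busybox pod ...
root@node2:~# iptables -t filter -L FORWARD -v -n -x > /tmp/fwd-after
root@node2:~# diff /tmp/fwd-before /tmp/fwd-after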

Different Node

Ping the nginx pod on node2 from the busybox pod on node1

root@busybox-5587dd9dcc-8x25g:~# ping 10.233.65.7 -c 1
PING 10.233.65.7 (10.233.65.7) 56(84) bytes of data.
64 bytes from 10.233.65.7: icmp_seq=1 ttl=63 time=4.12 ms

--- 10.233.65.7 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 4.120/4.120/4.120/0.000 ms

Capture point 1 - busybox's eth0

root@busybox-5587dd9dcc-8x25g:~# tcpdump -pne -i eth0
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
14:28:47.446003 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.446742 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.638498 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype ARP (0x0806), length 42: Request who-has 10.233.64.93 tell 10.233.64.174, length 28
14:28:52.638939 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype ARP (0x0806), length 42: Reply 10.233.64.93 is-at 32:12:64:c2:23:5b, length 28

Capture point 2 - the root-namespace end of busybox's veth pair

root@node1:~# tcpdump -pne -i lxca58908f2f283
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxca58908f2f283, link-type EN10MB (Ethernet), snapshot length 262144 bytes
14:28:47.446011 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.446735 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.638509 fe:91:05:b3:8a:de > 32:12:64:c2:23:5b, ethertype ARP (0x0806), length 42: Request who-has 10.233.64.93 tell 10.233.64.174, length 28
14:28:52.638928 32:12:64:c2:23:5b > fe:91:05:b3:8a:de, ethertype ARP (0x0806), length 42: Reply 10.233.64.93 is-at 32:12:64:c2:23:5b, length 28

Capture point 3 - the root-namespace end of nginx's veth pair

root@node2:~# tcpdump -pne -i lxc89fe98f4c9ed
14:28:47.459487 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype IPv4 (0x0800), length 98: 10.233.64.174 > 10.233.65.7: ICMP echo request, id 11395, seq 0, length 64
14:28:47.459514 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype IPv4 (0x0800), length 98: 10.233.65.7 > 10.233.64.174: ICMP echo reply, id 11395, seq 0, length 64
14:28:52.702616 2a:f9:c2:40:cb:5a > da:37:5b:ac:21:f2, ethertype ARP (0x0806), length 42: Request who-has 10.233.65.227 tell 10.233.65.7, length 28
14:28:52.703103 da:37:5b:ac:21:f2 > 2a:f9:c2:40:cb:5a, ethertype ARP (0x0806), length 42: Reply 10.233.65.227 is-at da:37:5b:ac:21:f2, length 28

Capture point 4 - the busybox endpoint watched via cilium-agent on node1

root@node1:/home/cilium# cilium monitor --related-to 443 -vv
Press Ctrl-C to quit
------------------------------------------------------------------------------
level=info msg="Initializing dissection cache..." subsys=monitor
Ethernet        {Contents=[..14..] Payload=[..86..] SrcMAC=fe:91:05:b3:8a:de DstMAC=32:12:64:c2:23:5b EthernetType=IPv4 Length=0}
IPv4    {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=48335 Flags=DF FragOffset=0 TTL=64 Protocol=ICMPv4 Checksum=58962 SrcIP=10.233.64.174 DstIP=10.233.65.7 Options=[] Padding=[]}
ICMPv4  {Contents=[..8..] Payload=[..56..] TypeCode=EchoRequest Checksum=58151 Id=11088 Seq=0}
  Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 443 to-overlay: 98 bytes (98 captured), state new, interface cilium_vxlan, , identity 31615->unknown, orig-ip 0.0.0.0
------------------------------------------------------------------------------
Ethernet        {Contents=[..14..] Payload=[..86..] SrcMAC=32:12:64:c2:23:5b DstMAC=fe:91:05:b3:8a:de EthernetType=IPv4 Length=0}
IPv4    {Contents=[..20..] Payload=[..64..] Version=4 IHL=5 TOS=0 Length=84 Id=47340 Flags= FragOffset=0 TTL=63 Protocol=ICMPv4 Checksum=11062 SrcIP=10.233.65.7 DstIP=10.233.64.174 Options=[] Padding=[]}
ICMPv4  {Contents=[..8..] Payload=[..56..] TypeCode=EchoReply Checksum=60199 Id=11088 Seq=0}
  Failed to decode layer: No decoder for layer type Payload
CPU 01: MARK 0x0 FROM 443 to-endpoint: 98 bytes (98 captured), state reply, interface lxca58908f2f283, , identity 5957->31615, orig-ip 10.233.65.7, to endpoint 443
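
For completeness, the same flow can also be watched from the receiving side; a sketch using the nginx endpoint id on node2 (226 in the endpoint list earlier) would be:

# on node2, follow the events related to the nginx endpoint; to-endpoint events for the request are expected here
root@node2:/home/cilium# cilium monitor --related-to 226 -vv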

tc programs on the VxLAN device

root@node1:~# tc filter show dev cilium_vxlan ingress
filter protocol all pref 1 bpf chain 0 
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_overlay.o:[from-overlay] direct-action not_in_hw id 38054 tag 8dadd616a2c190d7 jited 
root@node1:~# tc filter show dev cilium_vxlan egress
filter protocol all pref 1 bpf chain 0 
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_overlay.o:[to-overlay] direct-action not_in_hw id 38063 tag 634ef5728fd44f7a jited
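
The to-overlay/from-overlay programs take care of VXLAN encapsulation and decapsulation. Which remote node a destination pod CIDR maps to comes from Cilium's tunnel map, which can be listed on the agent, for example:

# show the pod-CIDR -> node-IP tunnel endpoint mapping used for VXLAN encapsulation
root@node1:/home/cilium# cilium bpf tunnel list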

From the captures above we can see that pod-to-pod traffic between different nodes goes through the VxLAN tunnel: unlike the same-node case, the cilium-agent monitor on node1 shows an additional to-overlay event for the request packet, and the VxLAN-encapsulated packet is then processed by the iptables chains in the host network namespace.
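
To see the encapsulation itself, the outer VXLAN packets can be captured on the node's physical interface; a sketch, assuming Cilium's default VXLAN port 8472 and the enp0s2 uplink seen in cilium status:

# capture the VXLAN-encapsulated traffic between the two nodes (port and interface are assumptions for this environment)
root@node2:~# tcpdump -pne -i enp0s2 'udp port 8472'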