Cilium eBPF helper functions: bpf_redirect/peer/neigh
The earlier analysis of the packet paths in the different datapath modes relied on the BPF helper functions below, which contribute a great deal to forwarding performance. This article looks at how they are implemented. First, a short summary of what each one does; their prototypes, as seen from a BPF program, follow the list.
bpf_redirect: redirects the packet to the specified interface; the flags argument selects whether the target's ingress or egress path is taken.
bpf_redirect_peer: redirects the packet to the peer of the specified interface (typically the peer of a veth device); only the ingress path is taken.
bpf_redirect_neigh: redirects the packet to the specified interface; it consults the routing table and the neighbor table and then sends the packet directly out of that egress interface.
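For reference, the prototypes of the three helpers from the BPF program's point of view look roughly as follows (paraphrased from the kernel UAPI helper descriptions); struct bpf_redir_neigh is the UAPI type passed to bpf_redirect_neigh and is used again in the kernel code analyzed below:
long bpf_redirect(__u32 ifindex, __u64 flags);
long bpf_redirect_peer(__u32 ifindex, __u64 flags);
long bpf_redirect_neigh(__u32 ifindex, struct bpf_redir_neigh *params,
			int plen, __u64 flags);

/* include/uapi/linux/bpf.h */
struct bpf_redir_neigh {
	/* network family for lookup (AF_INET, AF_INET6) */
	__u32 nh_family;
	/* network address of nexthop; skips fib lookup to find gateway */
	union {
		__be32	ipv4_nh;
		__u32	ipv6_nh[4];  /* in6_addr; network order */
	};
};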
Source code analysis
enum {
BPF_F_INGRESS = (1ULL << 0),
};
/* Internal, non-exposed redirect flags. */
enum {
BPF_F_NEIGH = (1ULL << 1),
BPF_F_PEER = (1ULL << 2),
BPF_F_NEXTHOP = (1ULL << 3),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};
bpf_redirect
Records the interface index and flags. If flags contains BPF_F_INGRESS, skb->dev is set to the device identified by ifindex and the packet is placed on the CPU backlog queue via enqueue_to_backlog, so that the next softirq processes it as if it had been received on that interface. If BPF_F_INGRESS is not set, dev_queue_xmit is called and the packet is transmitted out of the interface identified by ifindex.
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
ri->flags = flags;
ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
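As a usage sketch (not taken from Cilium's source), a minimal tc program that sends every packet out of a fixed interface could look like the following; TARGET_IFINDEX is a hypothetical constant:
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define TARGET_IFINDEX 4	/* hypothetical ifindex of the target device */

SEC("tc")
int redirect_egress(struct __sk_buff *skb)
{
	/* flags == 0: take the egress path of the target device, i.e.
	 * skb_do_redirect() ends up in __bpf_tx_skb()/dev_queue_xmit().
	 * Passing BPF_F_INGRESS instead would inject the packet into the
	 * target device's receive path via __bpf_rx_skb().
	 */
	return bpf_redirect(TARGET_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";
The program simply returns whatever the helper returned, normally TC_ACT_REDIRECT, and the tc hook turns that return value into the actual redirect, as shown further below.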
bpf_redirect_peer
Records the interface index and sets ri->flags to BPF_F_PEER; the flags argument must be 0.
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags))
return TC_ACT_SHOT;
ri->flags = BPF_F_PEER;
ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
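As an illustration of the typical use (roughly how Cilium uses it, attached at tc ingress of the host's network device; the ifindex below is made up), a minimal sketch:
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* hypothetical ifindex of the host-side end of the pod's veth pair */
#define LXC_IFINDEX 10

SEC("tc")
int redirect_into_pod(struct __sk_buff *skb)
{
	/* Deliver the packet on the *peer* of LXC_IFINDEX, i.e. the veth end
	 * inside the pod's netns. As shown below, skb_do_redirect() returns
	 * -EAGAIN in this case, so the receive path is re-run in the same
	 * softirq instead of going through the CPU backlog queue.
	 * The flags argument must be 0.
	 */
	return bpf_redirect_peer(LXC_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";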
bpf_redirect_neigh
Records the interface index and, if given, the next-hop (neighbor) parameters.
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT;
ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
ri->tgt_index = ifindex;
BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
if (plen)
memcpy(&ri->nh, params, sizeof(ri->nh));
return TC_ACT_REDIRECT;
}
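For illustration, a sketch that supplies an explicit IPv4 next hop (the gateway address and ifindex below are made up); passing params == NULL and plen == 0 instead makes the kernel fall back to a full route lookup, as the code later shows:
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

/* hypothetical egress device; gateway is 192.168.1.1 */
#define UPLINK_IFINDEX 2

SEC("tc")
int redirect_via_gateway(struct __sk_buff *skb)
{
	struct bpf_redir_neigh nh = {
		.nh_family = AF_INET,
		.ipv4_nh   = bpf_htonl(0xc0a80101),
	};

	/* With params/plen set, ri->flags becomes BPF_F_NEIGH | BPF_F_NEXTHOP,
	 * so bpf_out_neigh_v4() resolves the gateway's MAC via ip_neigh_gw4()
	 * without a prior route lookup. flags must be 0.
	 */
	return bpf_redirect_neigh(UPLINK_IFINDEX, &nh, sizeof(nh), 0);
}

char LICENSE[] SEC("license") = "GPL";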
The three helpers above are very simple: they only record the flags and the target interface index and return TC_ACT_REDIRECT; they do not redirect the packet themselves.
The function that runs the eBPF program (the one calling these helpers) acts on the return value, and only when it is TC_ACT_REDIRECT does the packet actually get redirected.
The eBPF program is executed in two places: the tc ingress hook and the tc egress hook.
Running the tc eBPF program on the ingress side of the stack
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev, bool *another)
{
switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
...
If the return value is TC_ACT_REDIRECT, the packet is redirected to the target device:
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
* we can safely push the L2 header back before
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
//if the return value is -EAGAIN, set *another to true, which tells the caller to go back and run the receive path once more
//only a redirect set up by bpf_redirect_peer returns -EAGAIN
*another = true;
break;
}
return NULL;
...
}
Running the tc eBPF program on the egress side
sch_handle_egress
//run the eBPF program
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
...
If the return value is TC_ACT_REDIRECT, the packet is redirected to the target device:
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
default:
break;
}
Whether on ingress or egress, skb_do_redirect is ultimately called to perform the actual redirect:
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
//look up the device by tgt_index
dev = dev_get_by_index_rcu(net, ri->tgt_index);
ri->tgt_index = 0;
ri->flags = 0;
if (unlikely(!dev))
goto out_drop;
//handling for bpf_redirect_peer
//when redirecting to the peer device, get the peer's dev via ndo_get_peer_dev (e.g. veth_peer_dev) and assign it to skb->dev
if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops;
...
dev = ops->ndo_get_peer_dev(dev);
...
skb->dev = dev;
//return -EAGAIN so that the receive path is run again.
//This is where bpf_redirect_peer's advantage shows: after -EAGAIN the packet is delivered into the pod within the current softirq,
//whereas bpf_redirect can only put the packet on the CPU backlog queue, so it reaches the pod only in the next softirq.
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
__bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
&ri->nh : NULL) :
__bpf_redirect(skb, dev, flags);
out_drop:
kfree_skb(skb);
return -EINVAL;
}
Handling of bpf_redirect
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags)
{
if (dev_is_mac_header_xmit(dev))
return __bpf_redirect_common(skb, dev, flags);
else
return __bpf_redirect_no_mac(skb, dev, flags);
}
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags)
{
/* Verify that a link layer header is carried */
if (unlikely(skb->mac_header >= skb->network_header)) {
kfree_skb(skb);
return -ERANGE;
}
bpf_push_mac_rcsum(skb);
//if BPF_F_INGRESS is set in flags, call __bpf_rx_skb, otherwise call __bpf_tx_skb
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}
The ingress path
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
}
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
//__dev_forward_skb first scrubs the skb; if it returns 0, netif_rx_internal puts the skb on the CPU backlog queue
return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret = ____dev_forward_skb(dev, skb);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
return ret;
}
static __always_inline int ____dev_forward_skb(struct net_device *dev,
struct sk_buff *skb)
{
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
//clear state from the skb that must not be carried across
skb_scrub_packet(skb, true);
skb->priority = 0;
return 0;
}
/**
* skb_scrub_packet - scrub an skb
*
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
* skb_scrub_packet can be used after encapsulating or decapsulting a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
* another namespace (@xnet == true). We have to clear all information in the
* skb that could impact namespace isolation.
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
skb_ext_reset(skb);
nf_reset_ct(skb);
nf_reset_trace(skb);
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
if (!xnet)
return;
ipvs_reset(skb);
skb->mark = 0;
skb->tstamp = 0;
}
The egress path
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
if (dev_xmit_recursion()) {
net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
kfree_skb(skb);
return -ENETDOWN;
}
skb->dev = dev;
skb->tstamp = 0;
dev_xmit_recursion_inc();
//transmit the packet out of dev
ret = dev_queue_xmit(skb);
dev_xmit_recursion_dec();
return ret;
}
Handling of bpf_redirect_neigh
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{
struct ethhdr *ethh = eth_hdr(skb);
if (unlikely(skb->mac_header >= skb->network_header))
goto out;
bpf_push_mac_rcsum(skb);
if (is_multicast_ether_addr(ethh->h_dest))
goto out;
skb_pull(skb, sizeof(*ethh));
skb_unset_mac_header(skb);
skb_reset_network_header(skb);
if (skb->protocol == htons(ETH_P_IP))
return __bpf_redirect_neigh_v4(skb, dev, nh);
else if (skb->protocol == htons(ETH_P_IPV6))
return __bpf_redirect_neigh_v6(skb, dev, nh);
out:
kfree_skb(skb);
return -ENOTSUPP;
}
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{
const struct iphdr *ip4h = ip_hdr(skb);
struct net *net = dev_net(dev);
int err, ret = NET_XMIT_DROP;
//if no next-hop information was supplied, do a route lookup
if (!nh) {
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
.flowi4_tos = RT_TOS(ip4h->tos),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
.saddr = ip4h->saddr,
};
struct rtable *rt;
rt = ip_route_output_flow(net, &fl4, NULL);
if (IS_ERR(rt))
goto out_drop;
if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
goto out_drop;
}
//attach the looked-up route to the skb
skb_dst_set(skb, &rt->dst);
}
err = bpf_out_neigh_v4(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++;
else
ret = NET_XMIT_SUCCESS;
goto out_xmit;
out_drop:
dev->stats.tx_errors++;
kfree_skb(skb);
out_xmit:
return ret;
}
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
struct net_device *dev, struct bpf_nh_params *nh)
{
u32 hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
if (dev_xmit_recursion()) {
net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
goto out_drop;
}
skb->dev = dev;
skb->tstamp = 0;
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, hh_len);
if (unlikely(!skb2)) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
rcu_read_lock_bh();
//no next hop was supplied: use the route information looked up in the previous step
if (!nh) {
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = container_of(dst, struct rtable, dst);
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
} else if (nh->nh_family == AF_INET6) {
neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
is_v6gw = true;
} else if (nh->nh_family == AF_INET) {
//look up the neighbor table; if no entry exists, create one
neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
} else {
rcu_read_unlock_bh();
goto out_drop;
}
if (likely(!IS_ERR(neigh))) {
int ret;
sock_confirm_neigh(skb, neigh);
dev_xmit_recursion_inc();
//run the neighbor subsystem: if the neigh entry is in NUD_CONNECTED state, the packet is sent out via dev_queue_xmit right away;
//otherwise the packet is queued first and an ARP request is sent to learn the peer's MAC address
ret = neigh_output(neigh, skb, is_v6gw);
dev_xmit_recursion_dec();
rcu_read_unlock_bh();
return ret;
}
rcu_read_unlock_bh();
out_drop:
kfree_skb(skb);
return -ENETDOWN;
}
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
struct sk_buff *skb,
bool *is_v6gw)
{
struct net_device *dev = rt->dst.dev;
struct neighbour *neigh;
if (likely(rt->rt_gw_family == AF_INET)) {
neigh = ip_neigh_gw4(dev, rt->rt_gw4);
} else if (rt->rt_gw_family == AF_INET6) {
neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
*is_v6gw = true;
} else {
neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr);
}
return neigh;
}
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
__be32 daddr)
{
struct neighbour *neigh;
//look up the neighbor table
neigh = __ipv4_neigh_lookup_noref(dev, daddr);
if (unlikely(!neigh))
//lookup failed, create the neighbor entry
neigh = __neigh_create(&arp_tbl, &daddr, dev, false);
return neigh;
}
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache)
{
const struct hh_cache *hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
return neigh_hh_output(hh, skb);
else
return n->output(n, skb);
}