Cilium eBPF helper functions: bpf_redirect/peer/neigh

The earlier analyses of the packet paths in the different modes relied on the bpf helper functions below, which contribute a great deal to forwarding performance. This article looks at how they are implemented; first, a short summary of what each function does:

bpf_redirect: redirects the packet to the specified interface; the flags argument selects whether the ingress or the egress path is taken
bpf_redirect_peer: redirects the packet to the peer of the specified interface (the typical case being a veth peer) and always takes the ingress path
bpf_redirect_neigh: redirects the packet to the specified interface; the kernel looks up the routing table and the neighbor table and then sends the packet straight out of that egress interface
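
For reference, the prototypes seen by a tc BPF program, as documented in the bpf-helpers(7) man page and generated into libbpf's bpf_helper_defs.h, look roughly as follows; each returns TC_ACT_REDIRECT on success and TC_ACT_SHOT on error, as the kernel sources below confirm:

long bpf_redirect(__u32 ifindex, __u64 flags);
long bpf_redirect_peer(__u32 ifindex, __u64 flags);
long bpf_redirect_neigh(__u32 ifindex, struct bpf_redir_neigh *params,
			int plen, __u64 flags);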

Source code analysis

The flags involved are defined as follows (BPF_F_INGRESS is part of the UAPI; the second enum is internal to net/core/filter.c):

enum {
	BPF_F_INGRESS			= (1ULL << 0),
};
/* Internal, non-exposed redirect flags. */
enum {
	BPF_F_NEIGH	= (1ULL << 1),
	BPF_F_PEER	= (1ULL << 2),
	BPF_F_NEXTHOP	= (1ULL << 3),
#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};

bpf_redirect
Records the interface index and flags. With BPF_F_INGRESS set, skb->dev is later switched to the device identified by ifindex and the packet is put on the per-CPU backlog via enqueue_to_backlog, so the next softirq processes it as if it had been received on that interface. Without BPF_F_INGRESS, dev_queue_xmit is eventually called to send the packet out of the interface identified by ifindex.

BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

	if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
		return TC_ACT_SHOT;

	ri->flags = flags;
	ri->tgt_index = ifindex;

	return TC_ACT_REDIRECT;
}
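
A minimal usage sketch from a tc program (the ifindex value and section name are hypothetical placeholders, not Cilium's actual datapath code): the program simply returns the helper's return value to the tc layer.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define TARGET_IFINDEX 5	/* hypothetical target interface index */

SEC("tc")
int redirect_example(struct __sk_buff *skb)
{
	/* flags = 0: egress path, i.e. dev_queue_xmit() on the target device;
	 * flags = BPF_F_INGRESS: enqueue to the backlog as if received there. */
	return bpf_redirect(TARGET_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";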

bpf_redirect_peer
Records the interface index and sets ri->flags to BPF_F_PEER; the flags argument must be 0.

BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

	if (unlikely(flags))
		return TC_ACT_SHOT;

	ri->flags = BPF_F_PEER;
	ri->tgt_index = ifindex;

	return TC_ACT_REDIRECT;
}
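
A minimal sketch of the typical use, on the tc ingress hook of a host-side device, jumping straight into the pod's network namespace via the host-side veth (the ifindex is a hypothetical placeholder; Cilium resolves the target endpoint from a map):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define POD_VETH_IFINDEX 10	/* hypothetical host-side veth ifindex */

SEC("tc")
int redirect_peer_example(struct __sk_buff *skb)
{
	/* flags must be 0; the packet shows up on the veth peer inside the
	 * pod's netns within the same softirq, bypassing the backlog queue. */
	return bpf_redirect_peer(POD_VETH_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";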

bpf_redirect_neigh
Records the interface index and, optionally, the nexthop (neighbor) parameters.

BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
	   int, plen, u64, flags)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

	if (unlikely((plen && plen < sizeof(*params)) || flags))
		return TC_ACT_SHOT;

	ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
	ri->tgt_index = ifindex;

	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
	if (plen)
		memcpy(&ri->nh, params, sizeof(ri->nh));

	return TC_ACT_REDIRECT;
}
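
A minimal sketch (the uplink ifindex and gateway address are hypothetical placeholders): passing a struct bpf_redir_neigh pins the nexthop explicitly and skips the route lookup, while passing NULL/0 lets the kernel consult the FIB first, as the kernel code later in this article shows.

#include <linux/bpf.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

#define UPLINK_IFINDEX 2	/* hypothetical uplink ifindex */

SEC("tc")
int redirect_neigh_example(struct __sk_buff *skb)
{
	struct bpf_redir_neigh nh = {
		.nh_family = AF_INET,
		.ipv4_nh   = bpf_htonl(0xc0a80001),	/* 192.168.0.1, hypothetical gateway */
	};

	/* with &nh/sizeof(nh) the kernel resolves the given gateway in the
	 * neighbor table directly; pass NULL and 0 to have it do the FIB
	 * lookup itself first */
	return bpf_redirect_neigh(UPLINK_IFINDEX, &nh, sizeof(nh), 0);
}

char _license[] SEC("license") = "GPL";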

The three helpers above are very simple: they only record the flags and the interface index and do not actually redirect anything, but each of them returns TC_ACT_REDIRECT.

The code that runs the eBPF program (i.e. the caller of these helpers) acts on the return value: only when it sees TC_ACT_REDIRECT does it actually redirect the packet.
There are two places where such eBPF programs run, the tc ingress and the tc egress hook.
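
As a hedged userspace sketch (object path, program name, and error handling are illustrative assumptions, not Cilium's loader), attaching a program at one of these two hook points can be done with libbpf's TC APIs:

#include <bpf/libbpf.h>

int attach_tc_ingress(int ifindex)
{
	LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex,
		    .attach_point = BPF_TC_INGRESS);	/* BPF_TC_EGRESS for the egress hook */
	LIBBPF_OPTS(bpf_tc_opts, opts);
	struct bpf_object *obj;
	struct bpf_program *prog;
	int err;

	obj = bpf_object__open_file("redirect_prog.o", NULL);	/* hypothetical object file */
	if (!obj || bpf_object__load(obj))
		return -1;

	prog = bpf_object__find_program_by_name(obj, "redirect_peer_example");
	if (!prog)
		return -1;

	/* creates the clsact qdisc; -EEXIST just means it is already there */
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST)
		return err;

	opts.prog_fd = bpf_program__fd(prog);
	return bpf_tc_attach(&hook, &opts);
}
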
Running the tc eBPF program on the ingress side of the stack

static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
	...
	//if the return value is TC_ACT_REDIRECT, redirect the packet to the target device
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		if (skb_do_redirect(skb) == -EAGAIN) {
			__skb_pull(skb, skb->mac_len);
			//if skb_do_redirect() returns -EAGAIN, set *another to true so the caller
			//goes back and runs the receive path once more for the new skb->dev;
			//only the bpf_redirect_peer case returns -EAGAIN
			*another = true;
			break;
		}
		return NULL;
	...
	}

Running the tc eBPF program on the egress side

sch_handle_egress
	//run the eBPF program
	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
	...
	//if the return value is TC_ACT_REDIRECT, redirect the packet to the target device
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

On both ingress and egress, skb_do_redirect is what ultimately carries out the actual redirect.

int skb_do_redirect(struct sk_buff *skb)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	struct net *net = dev_net(skb->dev);
	struct net_device *dev;
	u32 flags = ri->flags;

	//look up the dev from tgt_index
	dev = dev_get_by_index_rcu(net, ri->tgt_index);
	ri->tgt_index = 0;
	ri->flags = 0;
	if (unlikely(!dev))
		goto out_drop;

	//handling for bpf_redirect_peer
	//if the packet has to be redirected to a peer device, fetch the peer's dev via ndo_get_peer_dev (e.g. veth_peer_dev) and assign it to skb->dev
	if (flags & BPF_F_PEER) {
		const struct net_device_ops *ops = dev->netdev_ops;
		...
		dev = ops->ndo_get_peer_dev(dev);
		...
		skb->dev = dev;
		//return -EAGAIN so the receive path is run again for the peer device.
		//this is where bpf_redirect_peer's advantage shows: after -EAGAIN the packet reaches the inside of the pod within the same softirq,
		//whereas bpf_redirect can only put the packet on the CPU backlog queue and must wait for the next softirq before it gets into the pod
		return -EAGAIN;
	}
	return flags & BPF_F_NEIGH ?
	       __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
				    &ri->nh : NULL) :
	       __bpf_redirect(skb, dev, flags);
out_drop:
	kfree_skb(skb);
	return -EINVAL;
}

Handling of bpf_redirect

static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags)
{
	if (dev_is_mac_header_xmit(dev))
		return __bpf_redirect_common(skb, dev, flags);
	else
		return __bpf_redirect_no_mac(skb, dev, flags);
}

static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags)
{
	/* Verify that a link layer header is carried */
	if (unlikely(skb->mac_header >= skb->network_header)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	bpf_push_mac_rcsum(skb);
	//if BPF_F_INGRESS is set in flags, run __bpf_rx_skb; otherwise run __bpf_tx_skb
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}

The ingress path

static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb(dev, skb);
}

int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	//__dev_forward_skb first scrubs the skb; if it returns 0, netif_rx_internal then puts the skb on the CPU backlog queue
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}

static __always_inline int ____dev_forward_skb(struct net_device *dev,
					       struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	//clear state the skb should not carry over
	skb_scrub_packet(skb, true);
	skb->priority = 0;
	return 0;
}

/**
 * skb_scrub_packet - scrub an skb
 *
 * @skb: buffer to clean
 * @xnet: packet is crossing netns
 *
 * skb_scrub_packet can be used after encapsulating or decapsulting a packet
 * into/from a tunnel. Some information have to be cleared during these
 * operations.
 * skb_scrub_packet can also be used to clean a skb before injecting it in
 * another namespace (@xnet == true). We have to clear all information in the
 * skb that could impact namespace isolation.
 */
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
	skb->pkt_type = PACKET_HOST;
	skb->skb_iif = 0;
	skb->ignore_df = 0;
	skb_dst_drop(skb);
	skb_ext_reset(skb);
	nf_reset_ct(skb);
	nf_reset_trace(skb);

#ifdef CONFIG_NET_SWITCHDEV
	skb->offload_fwd_mark = 0;
	skb->offload_l3_fwd_mark = 0;
#endif

	if (!xnet)
		return;

	ipvs_reset(skb);
	skb->mark = 0;
	skb->tstamp = 0;
}

The egress path

static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}

	skb->dev = dev;
	skb->tstamp = 0;

	dev_xmit_recursion_inc();
	//transmit the packet out of dev
	ret = dev_queue_xmit(skb);
	dev_xmit_recursion_dec();

	return ret;
}

Handling of bpf_redirect_neigh

static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
				struct bpf_nh_params *nh)
{
	struct ethhdr *ethh = eth_hdr(skb);

	if (unlikely(skb->mac_header >= skb->network_header))
		goto out;
	bpf_push_mac_rcsum(skb);
	if (is_multicast_ether_addr(ethh->h_dest))
		goto out;

	skb_pull(skb, sizeof(*ethh));
	skb_unset_mac_header(skb);
	skb_reset_network_header(skb);

	if (skb->protocol == htons(ETH_P_IP))
		return __bpf_redirect_neigh_v4(skb, dev, nh);
	else if (skb->protocol == htons(ETH_P_IPV6))
		return __bpf_redirect_neigh_v6(skb, dev, nh);
out:
	kfree_skb(skb);
	return -ENOTSUPP;
}


static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct net *net = dev_net(dev);
	int err, ret = NET_XMIT_DROP;

	//if no nexthop was supplied, look up the routing table
	if (!nh) {
		struct flowi4 fl4 = {
			.flowi4_flags = FLOWI_FLAG_ANYSRC,
			.flowi4_mark  = skb->mark,
			.flowi4_tos   = RT_TOS(ip4h->tos),
			.flowi4_oif   = dev->ifindex,
			.flowi4_proto = ip4h->protocol,
			.daddr	      = ip4h->daddr,
			.saddr	      = ip4h->saddr,
		};
		struct rtable *rt;

		rt = ip_route_output_flow(net, &fl4, NULL);
		if (IS_ERR(rt))
			goto out_drop;
		if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
			ip_rt_put(rt);
			goto out_drop;
		}

		//attach the route to the skb
		skb_dst_set(skb, &rt->dst);
	}

	err = bpf_out_neigh_v4(net, skb, dev, nh);
	if (unlikely(net_xmit_eval(err)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;
	goto out_xmit;
out_drop:
	dev->stats.tx_errors++;
	kfree_skb(skb);
out_xmit:
	return ret;
}

static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
			    struct net_device *dev, struct bpf_nh_params *nh)
{
	u32 hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		goto out_drop;
	}

	skb->dev = dev;
	skb->tstamp = 0;

	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, hh_len);
		if (unlikely(!skb2)) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();
	//when no nexthop was supplied, use the route found in the previous step
	if (!nh) {
		struct dst_entry *dst = skb_dst(skb);
		struct rtable *rt = container_of(dst, struct rtable, dst);

		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	} else if (nh->nh_family == AF_INET6) {
		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
		is_v6gw = true;
	} else if (nh->nh_family == AF_INET) {
		//look up the neighbor table; if there is no entry, create one
		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
	} else {
		rcu_read_unlock_bh();
		goto out_drop;
	}

	if (likely(!IS_ERR(neigh))) {
		int ret;

		sock_confirm_neigh(skb, neigh);
		dev_xmit_recursion_inc();
		//run the neighbor subsystem: if the neigh entry is in NUD_CONNECTED state, the packet goes straight out via dev_queue_xmit,
		//otherwise it is queued first and an ARP request is sent to learn the peer's MAC
		ret = neigh_output(neigh, skb, is_v6gw);
		dev_xmit_recursion_dec();
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();
out_drop:
	kfree_skb(skb);
	return -ENETDOWN;
}

static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
						struct sk_buff *skb,
						bool *is_v6gw)
{
	struct net_device *dev = rt->dst.dev;
	struct neighbour *neigh;

	if (likely(rt->rt_gw_family == AF_INET)) {
		neigh = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
		*is_v6gw = true;
	} else {
		neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr);
	}
	return neigh;
}

static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
					     __be32 daddr)
{
	struct neighbour *neigh;

	//look up the neighbor table
	neigh = __ipv4_neigh_lookup_noref(dev, daddr);
	if (unlikely(!neigh))
		//lookup failed, create a neighbor entry
		neigh = __neigh_create(&arp_tbl, &daddr, dev, false);

	return neigh;
}

static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;

	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
		return neigh_hh_output(hh, skb);
	else
		return n->output(n, skb);
}