
Commit b4ab314

Daniel Borkmann authored and Alexei Starovoitov committed
bpf: Add redirect_neigh helper as redirect drop-in
Add a redirect_neigh() helper as redirect() drop-in replacement for the xmit side. The main idea for the helper is to be very similar in semantics to the latter, just that the skb gets injected into the neighboring subsystem in order to let the stack do the work it knows best anyway to populate the L2 addresses of the packet, and then hand over to dev_queue_xmit() as redirect() does.

This solves two bigger items: i) skbs don't need to go up to the stack on the host-facing veth ingress side for traffic egressing the container to achieve the same for populating L2, which also has the huge advantage that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering the IP routing layer on the host stack.

Given that skb->sk also does not get orphaned when crossing the netns, as per 9c4c325 ("skbuff: preserve sock reference when scrubbing the skb."), the helper can then push the skbs directly to the phys device, where the FQ scheduler can do its work and the TCP stack gets proper backpressure, given we hold on to skb->sk as long as the skb is still residing in queues.

With the helper used in the BPF data path to then push the skb to the phys device, I observed a stable/consistent TCP_STREAM improvement on veth devices for traffic going container -> host -> host -> container from ~10Gbps to ~15Gbps for a single stream in my test environment.

Signed-off-by: Daniel Borkmann <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Reviewed-by: David Ahern <[email protected]>
Acked-by: Martin KaFai Lau <[email protected]>
Cc: David Ahern <[email protected]>
Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net
1 parent 92acdc5 commit b4ab314
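To make the intended datapath concrete, here is a minimal sketch of a tc BPF program for the use case above: attached on the host-facing veth, it redirects container egress traffic straight to the physical device. This is an illustration, not part of the commit; the program/section names and the PHYS_IFINDEX constant are assumptions (in practice the ifindex would come from a map or be set at load time).

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: redirect skbs arriving on the host-side veth to the
 * physical device, letting the neighboring subsystem fill in L2.
 * PHYS_IFINDEX is a hypothetical compile-time constant.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define PHYS_IFINDEX 2

SEC("classifier")
int redirect_to_phys(struct __sk_buff *skb)
{
        /* flags must be 0; the helper returns TC_ACT_REDIRECT on
         * success or TC_ACT_SHOT on error, which we pass through.
         */
        return bpf_redirect_neigh(PHYS_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";

Compiled with clang -O2 -target bpf and attached in direct-action mode under a clsact qdisc, the TC_ACT_REDIRECT return hands the skb to skb_do_redirect(), just as with bpf_redirect().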

File tree: 4 files changed, +294, -15 lines changed


include/linux/skbuff.h (+5)

@@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb)
         return skb->mac_header != (typeof(skb->mac_header))~0U;
 }
 
+static inline void skb_unset_mac_header(struct sk_buff *skb)
+{
+        skb->mac_header = (typeof(skb->mac_header))~0U;
+}
+
 static inline void skb_reset_mac_header(struct sk_buff *skb)
 {
         skb->mac_header = skb->data - skb->head;
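The new skb_unset_mac_header() is the inverse of skb_reset_mac_header() with respect to the skb_mac_header_was_set() predicate shown in the hunk context: writing the ~0U sentinel makes the mac header read as absent again, which __bpf_redirect_neigh() in net/core/filter.c below relies on after pulling the Ethernet header. A minimal illustration, not part of the patch:

/* Illustration only: toggling the mac header between set and unset. */
skb_reset_mac_header(skb);      /* mac_header = skb->data - skb->head */
/* skb_mac_header_was_set(skb) is now true: mac_header != ~0U */

skb_unset_mac_header(skb);      /* mac_header = ~0U sentinel */
/* skb_mac_header_was_set(skb) is now false again */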

include/uapi/linux/bpf.h (+14)

@@ -3652,6 +3652,19 @@ union bpf_attr {
  *              associated socket instead of the current process.
  *      Return
  *              The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ *      Description
+ *              Redirect the packet to another net device of index *ifindex*
+ *              and fill in L2 addresses from neighboring subsystem. This helper
+ *              is somewhat similar to **bpf_redirect**\ (), except that it
+ *              fills in e.g. MAC addresses based on the L3 information from
+ *              the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *              The *flags* argument is reserved and must be 0. The helper is
+ *              currently only supported for tc BPF program types.
+ *      Return
+ *              The helper returns **TC_ACT_REDIRECT** on success or
+ *              **TC_ACT_SHOT** on error.
  */
 #define __BPF_FUNC_MAPPER(FN) \
         FN(unspec), \

@@ -3806,6 +3819,7 @@ union bpf_attr {
         FN(snprintf_btf), \
         FN(seq_printf_btf), \
         FN(skb_cgroup_classid), \
+        FN(redirect_neigh), \
 /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper

net/core/filter.c (+261, -15)

@@ -2163,13 +2163,233 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
         return __bpf_redirect_no_mac(skb, dev, flags);
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
+{
+        struct dst_entry *dst = skb_dst(skb);
+        struct net_device *dev = dst->dev;
+        u32 hh_len = LL_RESERVED_SPACE(dev);
+        const struct in6_addr *nexthop;
+        struct neighbour *neigh;
+
+        if (dev_xmit_recursion()) {
+                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+                goto out_drop;
+        }
+
+        skb->dev = dev;
+        skb->tstamp = 0;
+
+        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+                struct sk_buff *skb2;
+
+                skb2 = skb_realloc_headroom(skb, hh_len);
+                if (unlikely(!skb2)) {
+                        kfree_skb(skb);
+                        return -ENOMEM;
+                }
+                if (skb->sk)
+                        skb_set_owner_w(skb2, skb->sk);
+                consume_skb(skb);
+                skb = skb2;
+        }
+
+        rcu_read_lock_bh();
+        nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+                              &ipv6_hdr(skb)->daddr);
+        neigh = ip_neigh_gw6(dev, nexthop);
+        if (likely(!IS_ERR(neigh))) {
+                int ret;
+
+                sock_confirm_neigh(skb, neigh);
+                dev_xmit_recursion_inc();
+                ret = neigh_output(neigh, skb, false);
+                dev_xmit_recursion_dec();
+                rcu_read_unlock_bh();
+                return ret;
+        }
+        rcu_read_unlock_bh();
+        IP6_INC_STATS(dev_net(dst->dev),
+                      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+out_drop:
+        kfree_skb(skb);
+        return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+{
+        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+        struct net *net = dev_net(dev);
+        int err, ret = NET_XMIT_DROP;
+        struct dst_entry *dst;
+        struct flowi6 fl6 = {
+                .flowi6_flags = FLOWI_FLAG_ANYSRC,
+                .flowi6_mark  = skb->mark,
+                .flowlabel    = ip6_flowinfo(ip6h),
+                .flowi6_oif   = dev->ifindex,
+                .flowi6_proto = ip6h->nexthdr,
+                .daddr        = ip6h->daddr,
+                .saddr        = ip6h->saddr,
+        };
+
+        dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+        if (IS_ERR(dst))
+                goto out_drop;
+
+        skb_dst_set(skb, dst);
+
+        err = bpf_out_neigh_v6(net, skb);
+        if (unlikely(net_xmit_eval(err)))
+                dev->stats.tx_errors++;
+        else
+                ret = NET_XMIT_SUCCESS;
+        goto out_xmit;
+out_drop:
+        dev->stats.tx_errors++;
+        kfree_skb(skb);
+out_xmit:
+        return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+{
+        kfree_skb(skb);
+        return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
+{
+        struct dst_entry *dst = skb_dst(skb);
+        struct rtable *rt = container_of(dst, struct rtable, dst);
+        struct net_device *dev = dst->dev;
+        u32 hh_len = LL_RESERVED_SPACE(dev);
+        struct neighbour *neigh;
+        bool is_v6gw = false;
+
+        if (dev_xmit_recursion()) {
+                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+                goto out_drop;
+        }
+
+        skb->dev = dev;
+        skb->tstamp = 0;
+
+        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+                struct sk_buff *skb2;
+
+                skb2 = skb_realloc_headroom(skb, hh_len);
+                if (unlikely(!skb2)) {
+                        kfree_skb(skb);
+                        return -ENOMEM;
+                }
+                if (skb->sk)
+                        skb_set_owner_w(skb2, skb->sk);
+                consume_skb(skb);
+                skb = skb2;
+        }
+
+        rcu_read_lock_bh();
+        neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+        if (likely(!IS_ERR(neigh))) {
+                int ret;
+
+                sock_confirm_neigh(skb, neigh);
+                dev_xmit_recursion_inc();
+                ret = neigh_output(neigh, skb, is_v6gw);
+                dev_xmit_recursion_dec();
+                rcu_read_unlock_bh();
+                return ret;
+        }
+        rcu_read_unlock_bh();
+out_drop:
+        kfree_skb(skb);
+        return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+{
+        const struct iphdr *ip4h = ip_hdr(skb);
+        struct net *net = dev_net(dev);
+        int err, ret = NET_XMIT_DROP;
+        struct rtable *rt;
+        struct flowi4 fl4 = {
+                .flowi4_flags = FLOWI_FLAG_ANYSRC,
+                .flowi4_mark  = skb->mark,
+                .flowi4_tos   = RT_TOS(ip4h->tos),
+                .flowi4_oif   = dev->ifindex,
+                .flowi4_proto = ip4h->protocol,
+                .daddr        = ip4h->daddr,
+                .saddr        = ip4h->saddr,
+        };
+
+        rt = ip_route_output_flow(net, &fl4, NULL);
+        if (IS_ERR(rt))
+                goto out_drop;
+        if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+                ip_rt_put(rt);
+                goto out_drop;
+        }
+
+        skb_dst_set(skb, &rt->dst);
+
+        err = bpf_out_neigh_v4(net, skb);
+        if (unlikely(net_xmit_eval(err)))
+                dev->stats.tx_errors++;
+        else
+                ret = NET_XMIT_SUCCESS;
+        goto out_xmit;
+out_drop:
+        dev->stats.tx_errors++;
+        kfree_skb(skb);
+out_xmit:
+        return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+{
+        kfree_skb(skb);
+        return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
+{
+        struct ethhdr *ethh = eth_hdr(skb);
+
+        if (unlikely(skb->mac_header >= skb->network_header))
+                goto out;
+        bpf_push_mac_rcsum(skb);
+        if (is_multicast_ether_addr(ethh->h_dest))
+                goto out;
+
+        skb_pull(skb, sizeof(*ethh));
+        skb_unset_mac_header(skb);
+        skb_reset_network_header(skb);
+
+        if (skb->protocol == htons(ETH_P_IP))
+                return __bpf_redirect_neigh_v4(skb, dev);
+        else if (skb->protocol == htons(ETH_P_IPV6))
+                return __bpf_redirect_neigh_v6(skb, dev);
+out:
+        kfree_skb(skb);
+        return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+        BPF_F_NEIGH = (1ULL << 1),
+#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
+};
+
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 {
         struct net_device *dev;
         struct sk_buff *clone;
         int ret;
 
-        if (unlikely(flags & ~(BPF_F_INGRESS)))
+        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                 return -EINVAL;
 
         dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);

@@ -2206,23 +2426,11 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
 DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
 EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
 
-BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
-{
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
-        if (unlikely(flags & ~(BPF_F_INGRESS)))
-                return TC_ACT_SHOT;
-
-        ri->flags = flags;
-        ri->tgt_index = ifindex;
-
-        return TC_ACT_REDIRECT;
-}
-
 int skb_do_redirect(struct sk_buff *skb)
 {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
         struct net_device *dev;
+        u32 flags = ri->flags;
 
         dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
         ri->tgt_index = 0;

@@ -2231,7 +2439,22 @@ int skb_do_redirect(struct sk_buff *skb)
                 return -EINVAL;
         }
 
-        return __bpf_redirect(skb, dev, ri->flags);
+        return flags & BPF_F_NEIGH ?
+               __bpf_redirect_neigh(skb, dev) :
+               __bpf_redirect(skb, dev, flags);
+}
+
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
+{
+        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
+                return TC_ACT_SHOT;
+
+        ri->flags = flags;
+        ri->tgt_index = ifindex;
+
+        return TC_ACT_REDIRECT;
 }
 
 static const struct bpf_func_proto bpf_redirect_proto = {

@@ -2242,6 +2465,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
         .arg2_type = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
+{
+        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+        if (unlikely(flags))
+                return TC_ACT_SHOT;
+
+        ri->flags = BPF_F_NEIGH;
+        ri->tgt_index = ifindex;
+
+        return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+        .func      = bpf_redirect_neigh,
+        .gpl_only  = false,
+        .ret_type  = RET_INTEGER,
+        .arg1_type = ARG_ANYTHING,
+        .arg2_type = ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
 {
         msg->apply_bytes = bytes;

@@ -6759,6 +7003,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return bpf_get_skb_set_tunnel_proto(func_id);
         case BPF_FUNC_redirect:
                 return &bpf_redirect_proto;
+        case BPF_FUNC_redirect_neigh:
+                return &bpf_redirect_neigh_proto;
         case BPF_FUNC_get_route_realm:
                 return &bpf_get_route_realm_proto;
         case BPF_FUNC_get_hash_recalc:
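A note on the flag plumbing above: BPF_F_NEIGH is kernel-internal and deliberately sits on the bit next to the UAPI-visible BPF_F_INGRESS, and both bpf_redirect() and bpf_clone_redirect() now mask it out, so the only way to request neighbor resolution is bpf_redirect_neigh() itself. A program-side sketch of that contract (illustration only; the raw bit value is used since BPF_F_NEIGH is not exported to userspace headers, and IFINDEX is a hypothetical constant):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define IFINDEX 2

SEC("classifier")
int flag_contract(struct __sk_buff *skb)
{
        /* Smuggling the internal bit through the legacy helper fails:
         * any flag other than BPF_F_INGRESS yields TC_ACT_SHOT.
         */
        if (bpf_redirect(IFINDEX, 1ULL << 1) == TC_ACT_SHOT)
                /* The supported way to request L2 neighbor resolution: */
                return bpf_redirect_neigh(IFINDEX, 0);
        return TC_ACT_SHOT;
}

char _license[] SEC("license") = "GPL";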

tools/include/uapi/linux/bpf.h (+14)

@@ -3652,6 +3652,19 @@ union bpf_attr {
  *              associated socket instead of the current process.
  *      Return
  *              The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ *      Description
+ *              Redirect the packet to another net device of index *ifindex*
+ *              and fill in L2 addresses from neighboring subsystem. This helper
+ *              is somewhat similar to **bpf_redirect**\ (), except that it
+ *              fills in e.g. MAC addresses based on the L3 information from
+ *              the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *              The *flags* argument is reserved and must be 0. The helper is
+ *              currently only supported for tc BPF program types.
+ *      Return
+ *              The helper returns **TC_ACT_REDIRECT** on success or
+ *              **TC_ACT_SHOT** on error.
  */
 #define __BPF_FUNC_MAPPER(FN) \
         FN(unspec), \

@@ -3806,6 +3819,7 @@ union bpf_attr {
         FN(snprintf_btf), \
         FN(seq_printf_btf), \
         FN(skb_cgroup_classid), \
+        FN(redirect_neigh), \
 /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
