|
| 1 | +From 58be55eb5de3a319c0740ba8071440d972c821c3 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Harish Venkatraman < [email protected]> |
| 3 | +Date: Tue, 25 Sep 2018 09:56:25 -0700 |
| 4 | +Subject: [PATCH] net: udp_l3mdev_accept support |
| 5 | + |
| 6 | +From 63a6fff353d01da5a22b72670c434bf12fa0e3b8 Mon Sep 17 00:00:00 2001 |
| 7 | +From: Robert Shearman < [email protected]> |
| 8 | +Date: Thu, 26 Jan 2017 18:02:24 +0000 |
| 9 | +Subject: [PATCH] net: Avoid receiving packets with an l3mdev on unbound UDP |
| 10 | + sockets |
| 11 | + |
| 12 | +Packets arriving in a VRF currently are delivered to UDP sockets that |
| 13 | +aren't bound to any interface. TCP defaults to not delivering packets |
| 14 | +arriving in a VRF to unbound sockets. IP route lookup and socket |
| 15 | +transmit both assume that unbound means using the default table and |
| 16 | +UDP applications that haven't been changed to be aware of VRFs may not |
| 17 | +function correctly in this case since they may not be able to handle |
| 18 | +overlapping IP address ranges, or be able to send packets back to the |
| 19 | +original sender if required. |
| 20 | + |
| 21 | +So add a sysctl, udp_l3mdev_accept, to control this behaviour with it |
| 22 | +being analogous to the existing tcp_l3mdev_accept, namely to allow a |
| 23 | +process to have a VRF-global listen socket. Have this default to off |
| 24 | +as this is the behaviour that users will expect, given that there is |
| 25 | +no explicit mechanism to set unmodified VRF-unaware applications into a |
| 26 | +default VRF. |
| 27 | + |
| 28 | +Signed-off-by: Robert Shearman < [email protected]> |
| 29 | +Acked-by: David Ahern < [email protected]> |
| 30 | +Tested-by: David Ahern < [email protected]> |
| 31 | +Signed-off-by: David S. Miller < [email protected]> |
| 32 | + |
| 33 | +--- |
| 34 | + Documentation/networking/ip-sysctl.txt | 7 +++++++ |
| 35 | + Documentation/networking/vrf.txt | 7 ++++--- |
| 36 | + include/net/netns/ipv4.h | 4 ++++ |
| 37 | + net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++ |
| 38 | + net/ipv4/udp.c | 27 ++++++++++++++++++++------- |
| 39 | + net/ipv6/udp.c | 27 ++++++++++++++++++++------- |
| 40 | + 6 files changed, 66 insertions(+), 17 deletions(-) |
| 41 | + |
| 42 | +diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt |
| 43 | +index 3db8c67..7ceeff3 100644 |
| 44 | +--- a/Documentation/networking/ip-sysctl.txt |
| 45 | ++++ b/Documentation/networking/ip-sysctl.txt |
| 46 | +@@ -737,6 +737,13 @@ tcp_challenge_ack_limit - INTEGER |
| 47 | + |
| 48 | + UDP variables: |
| 49 | + |
| 50 | ++udp_l3mdev_accept - BOOLEAN |
| 51 | ++ Enabling this option allows a "global" bound socket to work |
| 52 | ++ across L3 master domains (e.g., VRFs) with packets capable of |
| 53 | ++ being received regardless of the L3 domain in which they |
| 54 | ++ originated. Only valid when the kernel was compiled with |
| 55 | ++ CONFIG_NET_L3_MASTER_DEV. |
| 56 | ++ |
| 57 | + udp_mem - vector of 3 INTEGERs: min, pressure, max |
| 58 | + Number of pages allowed for queueing by all UDP sockets. |
| 59 | + |
| 60 | +diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt |
| 61 | +index 755dab8..3918dae 100644 |
| 62 | +--- a/Documentation/networking/vrf.txt |
| 63 | ++++ b/Documentation/networking/vrf.txt |
| 64 | +@@ -98,10 +98,11 @@ VRF device: |
| 65 | + |
| 66 | + or to specify the output device using cmsg and IP_PKTINFO. |
| 67 | + |
| 68 | +-TCP services running in the default VRF context (ie., not bound to any VRF |
| 69 | +-device) can work across all VRF domains by enabling the tcp_l3mdev_accept |
| 70 | +-sysctl option: |
| 71 | ++TCP & UDP services running in the default VRF context (ie., not bound |
| 72 | ++to any VRF device) can work across all VRF domains by enabling the |
| 73 | ++tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: |
| 74 | + sysctl -w net.ipv4.tcp_l3mdev_accept=1 |
| 75 | ++ sysctl -w net.ipv4.udp_l3mdev_accept=1 |
| 76 | + |
| 77 | + netfilter rules on the VRF device can be used to limit access to services |
| 78 | + running in the default VRF context as well. |
| 79 | +diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h |
| 80 | +index 7adf438..3917764 100644 |
| 81 | +--- a/include/net/netns/ipv4.h |
| 82 | ++++ b/include/net/netns/ipv4.h |
| 83 | +@@ -111,6 +111,10 @@ struct netns_ipv4 { |
| 84 | + int sysctl_tcp_fin_timeout; |
| 85 | + unsigned int sysctl_tcp_notsent_lowat; |
| 86 | + |
| 87 | ++#ifdef CONFIG_NET_L3_MASTER_DEV |
| 88 | ++ int sysctl_udp_l3mdev_accept; |
| 89 | ++#endif |
| 90 | ++ |
| 91 | + int sysctl_igmp_max_memberships; |
| 92 | + int sysctl_igmp_max_msf; |
| 93 | + int sysctl_igmp_llm_reports; |
| 94 | +diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c |
| 95 | +index 566cfc5..2006032 100644 |
| 96 | +--- a/net/ipv4/sysctl_net_ipv4.c |
| 97 | ++++ b/net/ipv4/sysctl_net_ipv4.c |
| 98 | +@@ -971,6 +971,17 @@ static struct ctl_table ipv4_net_table[] = { |
| 99 | + .extra2 = &one, |
| 100 | + }, |
| 101 | + #endif |
| 102 | ++#ifdef CONFIG_NET_L3_MASTER_DEV |
| 103 | ++ { |
| 104 | ++ .procname = "udp_l3mdev_accept", |
| 105 | ++ .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, |
| 106 | ++ .maxlen = sizeof(int), |
| 107 | ++ .mode = 0644, |
| 108 | ++ .proc_handler = proc_dointvec_minmax, |
| 109 | ++ .extra1 = &zero, |
| 110 | ++ .extra2 = &one, |
| 111 | ++ }, |
| 112 | ++#endif |
| 113 | + { } |
| 114 | + }; |
| 115 | + |
| 116 | +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c |
| 117 | +index aa2a20e..7229028 100644 |
| 118 | +--- a/net/ipv4/udp.c |
| 119 | ++++ b/net/ipv4/udp.c |
| 120 | +@@ -134,6 +134,17 @@ EXPORT_SYMBOL(udp_memory_allocated); |
| 121 | + #define MAX_UDP_PORTS 65536 |
| 122 | + #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) |
| 123 | + |
| 124 | ++/* IPCB reference means this can not be used from early demux */ |
| 125 | ++static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) |
| 126 | ++{ |
| 127 | ++#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) |
| 128 | ++ if (!net->ipv4.sysctl_udp_l3mdev_accept && |
| 129 | ++ skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) |
| 130 | ++ return true; |
| 131 | ++#endif |
| 132 | ++ return false; |
| 133 | ++} |
| 134 | ++ |
| 135 | + static int udp_lib_lport_inuse(struct net *net, __u16 num, |
| 136 | + const struct udp_hslot *hslot, |
| 137 | + unsigned long *bitmap, |
| 138 | +@@ -391,7 +402,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) |
| 139 | + |
| 140 | + static int compute_score(struct sock *sk, struct net *net, |
| 141 | + __be32 saddr, __be16 sport, |
| 142 | +- __be32 daddr, unsigned short hnum, int dif) |
| 143 | ++ __be32 daddr, unsigned short hnum, int dif, |
| 144 | ++ bool exact_dif) |
| 145 | + { |
| 146 | + int score; |
| 147 | + struct inet_sock *inet; |
| 148 | +@@ -422,7 +434,7 @@ static int compute_score(struct sock *sk, struct net *net, |
| 149 | + score += 4; |
| 150 | + } |
| 151 | + |
| 152 | +- if (sk->sk_bound_dev_if) { |
| 153 | ++ if (sk->sk_bound_dev_if || exact_dif) { |
| 154 | + if (sk->sk_bound_dev_if != dif) |
| 155 | + return -1; |
| 156 | + score += 4; |
| 157 | +@@ -447,7 +459,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, |
| 158 | + /* called with rcu_read_lock() */ |
| 159 | + static struct sock *udp4_lib_lookup2(struct net *net, |
| 160 | + __be32 saddr, __be16 sport, |
| 161 | +- __be32 daddr, unsigned int hnum, int dif, |
| 162 | ++ __be32 daddr, unsigned int hnum, int dif, bool exact_dif, |
| 163 | + struct udp_hslot *hslot2, |
| 164 | + struct sk_buff *skb) |
| 165 | + { |
| 166 | +@@ -459,7 +471,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, |
| 167 | + badness = 0; |
| 168 | + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { |
| 169 | + score = compute_score(sk, net, saddr, sport, |
| 170 | +- daddr, hnum, dif); |
| 171 | ++ daddr, hnum, dif, exact_dif); |
| 172 | + if (score > badness) { |
| 173 | + reuseport = sk->sk_reuseport; |
| 174 | + if (reuseport) { |
| 175 | +@@ -494,6 +506,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, |
| 176 | + unsigned short hnum = ntohs(dport); |
| 177 | + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); |
| 178 | + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; |
| 179 | ++ bool exact_dif = udp_lib_exact_dif_match(net, skb); |
| 180 | + int score, badness, matches = 0, reuseport = 0; |
| 181 | + u32 hash = 0; |
| 182 | + |
| 183 | +@@ -506,7 +519,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, |
| 184 | + |
| 185 | + result = udp4_lib_lookup2(net, saddr, sport, |
| 186 | + daddr, hnum, dif, |
| 187 | +- hslot2, skb); |
| 188 | ++ exact_dif, hslot2, skb); |
| 189 | + if (!result) { |
| 190 | + unsigned int old_slot2 = slot2; |
| 191 | + hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); |
| 192 | +@@ -521,7 +534,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, |
| 193 | + |
| 194 | + result = udp4_lib_lookup2(net, saddr, sport, |
| 195 | + daddr, hnum, dif, |
| 196 | +- hslot2, skb); |
| 197 | ++ exact_dif, hslot2, skb); |
| 198 | + } |
| 199 | + return result; |
| 200 | + } |
| 201 | +@@ -530,7 +543,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, |
| 202 | + badness = 0; |
| 203 | + sk_for_each_rcu(sk, &hslot->head) { |
| 204 | + score = compute_score(sk, net, saddr, sport, |
| 205 | +- daddr, hnum, dif); |
| 206 | ++ daddr, hnum, dif, exact_dif); |
| 207 | + if (score > badness) { |
| 208 | + reuseport = sk->sk_reuseport; |
| 209 | + if (reuseport) { |
| 210 | +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c |
| 211 | +index 4db5f54..1317d2f 100644 |
| 212 | +--- a/net/ipv6/udp.c |
| 213 | ++++ b/net/ipv6/udp.c |
| 214 | +@@ -55,6 +55,16 @@ |
| 215 | + #include <trace/events/skb.h> |
| 216 | + #include "udp_impl.h" |
| 217 | + |
| 218 | ++static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) |
| 219 | ++{ |
| 220 | ++#if defined(CONFIG_NET_L3_MASTER_DEV) |
| 221 | ++ if (!net->ipv4.sysctl_udp_l3mdev_accept && |
| 222 | ++ skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) |
| 223 | ++ return true; |
| 224 | ++#endif |
| 225 | ++ return false; |
| 226 | ++} |
| 227 | ++ |
| 228 | + static u32 udp6_ehashfn(const struct net *net, |
| 229 | + const struct in6_addr *laddr, |
| 230 | + const u16 lport, |
| 231 | +@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk) |
| 232 | + static int compute_score(struct sock *sk, struct net *net, |
| 233 | + const struct in6_addr *saddr, __be16 sport, |
| 234 | + const struct in6_addr *daddr, unsigned short hnum, |
| 235 | +- int dif) |
| 236 | ++ int dif, bool exact_dif) |
| 237 | + { |
| 238 | + int score; |
| 239 | + struct inet_sock *inet; |
| 240 | +@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net, |
| 241 | + score++; |
| 242 | + } |
| 243 | + |
| 244 | +- if (sk->sk_bound_dev_if) { |
| 245 | ++ if (sk->sk_bound_dev_if || exact_dif) { |
| 246 | + if (sk->sk_bound_dev_if != dif) |
| 247 | + return -1; |
| 248 | + score++; |
| 249 | +@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net, |
| 250 | + static struct sock *udp6_lib_lookup2(struct net *net, |
| 251 | + const struct in6_addr *saddr, __be16 sport, |
| 252 | + const struct in6_addr *daddr, unsigned int hnum, int dif, |
| 253 | +- struct udp_hslot *hslot2, |
| 254 | ++ bool exact_dif, struct udp_hslot *hslot2, |
| 255 | + struct sk_buff *skb) |
| 256 | + { |
| 257 | + struct sock *sk, *result; |
| 258 | +@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, |
| 259 | + badness = -1; |
| 260 | + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { |
| 261 | + score = compute_score(sk, net, saddr, sport, |
| 262 | +- daddr, hnum, dif); |
| 263 | ++ daddr, hnum, dif, exact_dif); |
| 264 | + if (score > badness) { |
| 265 | + reuseport = sk->sk_reuseport; |
| 266 | + if (reuseport) { |
| 267 | +@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net, |
| 268 | + unsigned short hnum = ntohs(dport); |
| 269 | + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); |
| 270 | + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; |
| 271 | ++ bool exact_dif = udp6_lib_exact_dif_match(net, skb); |
| 272 | + int score, badness, matches = 0, reuseport = 0; |
| 273 | + u32 hash = 0; |
| 274 | + |
| 275 | +@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net, |
| 276 | + goto begin; |
| 277 | + |
| 278 | + result = udp6_lib_lookup2(net, saddr, sport, |
| 279 | +- daddr, hnum, dif, |
| 280 | ++ daddr, hnum, dif, exact_dif, |
| 281 | + hslot2, skb); |
| 282 | + if (!result) { |
| 283 | + unsigned int old_slot2 = slot2; |
| 284 | +@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net, |
| 285 | + |
| 286 | + result = udp6_lib_lookup2(net, saddr, sport, |
| 287 | + daddr, hnum, dif, |
| 288 | +- hslot2, skb); |
| 289 | ++ exact_dif, hslot2, |
| 290 | ++ skb); |
| 291 | + } |
| 292 | + return result; |
| 293 | + } |
| 294 | +@@ -247,7 +259,8 @@ struct sock *__udp6_lib_lookup(struct net *net, |
| 295 | + result = NULL; |
| 296 | + badness = -1; |
| 297 | + sk_for_each_rcu(sk, &hslot->head) { |
| 298 | +- score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); |
| 299 | ++ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, |
| 300 | ++ exact_dif); |
| 301 | + if (score > badness) { |
| 302 | + reuseport = sk->sk_reuseport; |
| 303 | + if (reuseport) { |
| 304 | +-- |
| 305 | +2.7.4 |
| 306 | + |
0 commit comments