- From 660e63c0bbae1a7f58dadf04c1b7a9eef7621227 Mon Sep 17 00:00:00 2001
- From: Kiran Kella <kiran.kella@broadcom.com>
- Date: Tue, 5 Oct 2021 23:26:02 -0700
- Subject: [PATCH] netfilter: nf_nat: Support fullcone NAT
+ From d1dd893ddae49ca4dc55073449c37d5b97504c05 Mon Sep 17 00:00:00 2001
+ From: Akhilesh Samineni <akhilesh.samineni@broadcom.com>
+ Date: Mon, 6 Nov 2023 11:55:58 -0800
+ Subject: [PATCH] Support fullcone NAT

Changes done in the kernel to ensure 3-tuple uniqueness of the conntrack
entries for the fullcone nat functionality.
@@ -27,43 +27,42 @@ The kernel changes mentioned above are done to counter the challenges
explained in the section *3.4.2.1 Handling NAT model mismatch between
the ASIC and the Kernel* in the NAT HLD [1].

- [1]: https://github.com/kirankella/SONiC/blob/nat_doc_changes/doc/nat/nat_design_spec.md
+ [1]: https://github.com/sonic-net/SONiC/blob/master/doc/nat/nat_design_spec.md

- Signed-off-by: Kiran Kella <kiran.kella@broadcom.com>
+ Signed-off-by: Akhilesh Samineni <akhilesh.samineni@broadcom.com>
---
- include/net/netfilter/nf_conntrack.h | 3 +
- include/uapi/linux/netfilter/nf_nat.h | 4 +-
- net/netfilter/nf_nat_core.c | 204 ++++++++++++++++++++++----
- 3 files changed, 180 insertions(+), 31 deletions(-)
+ include/net/netfilter/nf_conntrack.h | 3 +
+ include/uapi/linux/netfilter/nf_nat.h | 3 +-
+ net/netfilter/nf_nat_core.c | 222 +++++++++++++++---
+ 3 files changed, 197 insertions(+), 31 deletions(-)
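A note on the model for readers outside the netfilter tree: full-cone (endpoint-independent) NAT keys a mapping only on the internal source 3-tuple (protocol, source IP, source port). Every destination therefore sees the same translated source, and that translated 3-tuple must be unique across all destinations, while stock Linux NAT only guarantees uniqueness of the full 5-tuple. The toy userspace sketch below illustrates just the mapping rule; every name and the fixed-size table are hypothetical:

    /* Toy model of endpoint-independent ("full cone") mapping.
     * Hypothetical names throughout; the patch enforces the same rule
     * with conntrack entries instead of this linear table. */
    #include <stdint.h>
    #include <stdio.h>

    struct tuple3 { uint8_t proto; uint32_t ip; uint16_t port; };
    struct mapping { struct tuple3 in, out; int used; };

    static struct mapping table[64];

    static int same3(const struct tuple3 *a, const struct tuple3 *b)
    {
        return a->proto == b->proto && a->ip == b->ip && a->port == b->port;
    }

    /* One external 3-tuple per internal 3-tuple, reused for every
     * destination and never handed to a second internal 3-tuple. */
    static struct tuple3 fullcone_map(struct tuple3 in, uint32_t ext_ip,
                                      uint16_t *next_port)
    {
        for (int i = 0; i < 64; i++)
            if (table[i].used && same3(&table[i].in, &in))
                return table[i].out;        /* reuse, whatever the dst */

        for (int i = 0; i < 64; i++)
            if (!table[i].used) {
                table[i].in = in;
                table[i].out = (struct tuple3){ in.proto, ext_ip,
                                                (*next_port)++ };
                table[i].used = 1;
                return table[i].out;
            }
        return in;                          /* table full: no translation */
    }

    int main(void)
    {
        uint16_t next_port = 1024;
        struct tuple3 host = { 17 /* UDP */, 0xc0a80102 /* 192.168.1.2 */, 5000 };

        /* Flows to two different destinations get the same external port. */
        printf("%u\n", fullcone_map(host, 0x0a000001, &next_port).port); /* 1024 */
        printf("%u\n", fullcone_map(host, 0x0a000001, &next_port).port); /* 1024 */
        return 0;
    }

The patch enforces this rule in-kernel with conntrack entries and the nf_nat_by_manip_src hash table introduced below.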
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
- index 439379ca9..c4c05b7b0 100644
+ index 6a2019aaa..191d6367c 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
- @@ -85,6 +85,9 @@ struct nf_conn {
+ @@ -103,6 +103,9 @@ struct nf_conn {

#if IS_ENABLED(CONFIG_NF_NAT)
struct hlist_node nat_bysource;
+
- + /* To optionally ensure 3-tuple uniqueness on the translated source */
- + struct hlist_node nat_by_manip_src;
+ + /* To optionally ensure 3-tuple uniqueness on the translated source */
+ + struct hlist_node nat_by_manip_src;
#endif
/* all members below initialized via memset */
struct { } __nfct_init_offset;
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
- index a64586e77..9b3f48a7d 100644
+ index a64586e77..d60f5a9c2 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
- @@ -13,6 +13,8 @@
+ @@ -12,6 +12,7 @@
+ #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5)
#define NF_NAT_RANGE_NETMAP (1 << 6)
+ + #define NF_NAT_RANGE_FULLCONE (1 << 10)

- + #define NF_NAT_RANGE_FULLCONE (1 << 10)
- +
#define NF_NAT_RANGE_PROTO_RANDOM_ALL \
(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
-
- @@ -20,7 +22,7 @@
+ @@ -20,7 +21,7 @@
(NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \
NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \
NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \
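The new NF_NAT_RANGE_FULLCONE bit travels into the NAT core the same way as the existing range flags, via the flags field of struct nf_nat_range2. A minimal sketch of an in-kernel caller, assuming a hypothetical helper name (the iptables target wiring that actually sets the bit is outside this patch):

    #include <net/netfilter/nf_nat.h>

    /* Hypothetical helper: request full-cone SNAT to ext_ip with ports
     * port_lo..port_hi for this connection. Everything except the
     * NF_NAT_RANGE_FULLCONE bit is the stock nf_nat_setup_info() API. */
    static unsigned int fullcone_snat(struct nf_conn *ct, __be32 ext_ip,
                                      __be16 port_lo, __be16 port_hi)
    {
        struct nf_nat_range2 range = {
            .flags     = NF_NAT_RANGE_MAP_IPS |
                         NF_NAT_RANGE_PROTO_SPECIFIED |
                         NF_NAT_RANGE_FULLCONE,
            .min_addr  = { .ip = ext_ip },
            .max_addr  = { .ip = ext_ip },
            .min_proto = { .all = port_lo },
            .max_proto = { .all = port_hi },
        };

        /* Pick a translated 3-tuple that is unique across all
         * destinations and install it for the ORIGINAL direction. */
        return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
    }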
@@ -73,7 +72,7 @@ index a64586e77..9b3f48a7d 100644
struct nf_nat_ipv4_range {
unsigned int flags;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
- index b7c3c9022..16cac0253 100644
+ index e29e4ccb5..678b50967 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -33,6 +33,7 @@ static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -82,41 +81,60 @@ index b7c3c9022..16cac0253 100644
static struct hlist_head *nf_nat_bysource __read_mostly;
+ static struct hlist_head *nf_nat_by_manip_src __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
- static unsigned int nf_nat_hash_rnd __read_mostly;
+ static siphash_aligned_key_t nf_nat_hash_rnd;

- @@ -200,6 +201,31 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+ @@ -180,6 +181,50 @@ hash_by_src(const struct net *net,
return reciprocal_scale(hash, nf_nat_htable_size);
}

+ static inline unsigned int
- + hash_by_dst(const struct net *n, const struct nf_conntrack_tuple *tuple)
+ + hash_by_dst(const struct net *net,
+ + const struct nf_conntrack_zone *zone,
+ + const struct nf_conntrack_tuple *tuple)
+ {
- + unsigned int hash;
+ + unsigned int hash;
+ + struct {
+ + union nf_inet_addr dst_addr;
+ + u32 net_mix;
+ + u16 dport;
+ + u32 protonum;
+ + u32 zone;
+ + } __aligned(SIPHASH_ALIGNMENT) combined;
+ +
+ + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ +
+ + memset(&combined, 0, sizeof(combined));
+ +
+ + combined.dst_addr = tuple->dst.u3;
+ + combined.net_mix = net_hash_mix(net);
+ + combined.protonum = tuple->dst.protonum;
+ + combined.dport = (__force __u16)tuple->dst.u.all;
+
- + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ + /* Zone ID can be used provided its valid for both directions */
+ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
+ + combined.zone = zone->id;
+
- + hash = jhash2((u32 *)&tuple->dst, sizeof(tuple->dst) / sizeof(u32),
- + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
+ + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
+
- + return reciprocal_scale(hash, nf_nat_htable_size);
+ + return reciprocal_scale(hash, nf_nat_htable_size);
+ }
+
+ static inline int
+ same_reply_dst(const struct nf_conn *ct,
+ const struct nf_conntrack_tuple *tuple)
+ {
- + const struct nf_conntrack_tuple *t;
+ + const struct nf_conntrack_tuple *t;
+
- + t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- + return (t->dst.protonum == tuple->dst.protonum &&
- + nf_inet_addr_cmp(&t->dst.u3, &tuple->dst.u3) &&
- + t->dst.u.all == tuple->dst.u.all);
+ + t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ + return (t->dst.protonum == tuple->dst.protonum &&
+ + nf_inet_addr_cmp(&t->dst.u3, &tuple->dst.u3) &&
+ + t->dst.u.all == tuple->dst.u.all);
+ }
+
/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
- @@ -217,6 +243,38 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+ @@ -197,6 +242,38 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
@@ -138,7 +156,7 @@ index b7c3c9022..16cac0253 100644
+ zone = nf_ct_zone(ignored_conntrack);
+
+ /* The tuple passed here is the inverted reply (with translated source) */
- + h = hash_by_src(net, tuple);
+ + h = hash_by_src(net, zone, tuple);
+ hlist_for_each_entry_rcu(ct, &nf_nat_by_manip_src[h], nat_by_manip_src) {
+ struct nf_conntrack_tuple reply;
+ nf_ct_invert_tuple(&reply, tuple);
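Taken together, the hunks above make nf_nat_used_tuple() treat a candidate translated source 3-tuple as taken when any other tracked connection already owns it, regardless of destination. A condensed paraphrase of that lookup, assuming (as the visible lines suggest) that chain entries are compared against the inverted candidate tuple; locking and the flag gating are omitted:

    /* Condensed paraphrase of the added check, not the literal hunk.
     * manip_src_taken() is a hypothetical name; zone direction handling
     * and the NF_NAT_RANGE_FULLCONE gating are left out. */
    static bool manip_src_taken(struct net *net,
                                const struct nf_conntrack_zone *zone,
                                const struct nf_conntrack_tuple *tuple,
                                const struct nf_conn *ignored)
    {
        unsigned int h = hash_by_src(net, zone, tuple);
        const struct nf_conn *ct;
        struct nf_conntrack_tuple reply;

        nf_ct_invert_tuple(&reply, tuple);  /* reply.dst = candidate src */

        hlist_for_each_entry_rcu(ct, &nf_nat_by_manip_src[h], nat_by_manip_src) {
            /* Another connection already answers to this translated
             * source 3-tuple, so the candidate is taken. */
            if (ct != ignored && same_reply_dst(ct, &reply))
                return true;
        }
        return false;
    }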
@@ -155,7 +173,7 @@ index b7c3c9022..16cac0253 100644
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
{
- @@ -318,6 +376,34 @@ find_appropriate_src(struct net *net,
+ @@ -298,6 +375,33 @@ find_appropriate_src(struct net *net,
return 0;
}
@@ -171,7 +189,7 @@ index b7c3c9022..16cac0253 100644
+ const struct nf_conn *ct;
+
+ nf_ct_invert_tuple(&reply, tuple);
- + h = hash_by_src(net, &reply);
+ + h = hash_by_src(net, zone, &reply);
+
+ hlist_for_each_entry_rcu(ct, &nf_nat_by_manip_src[h], nat_by_manip_src) {
+ if (same_reply_dst(ct, tuple) &&
@@ -186,11 +204,10 @@ index b7c3c9022..16cac0253 100644
+ }
+ return 0;
+ }
- +
/* For [FUTURE] fragmentation handling, we want the least-used
* src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
* if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
- @@ -397,10 +483,10 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
+ @@ -377,10 +481,10 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
*
* Per-protocol part of tuple is initialized to the incoming packet.
*/
@@ -205,7 +222,7 @@ index b7c3c9022..16cac0253 100644
{
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
- @@ -426,7 +512,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -406,7 +510,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
/* If there is no master conntrack we are not PPTP,
do not change tuples */
if (!ct->master)
@@ -214,7 +231,7 @@ index b7c3c9022..16cac0253 100644

if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.gre.key;
- @@ -454,14 +540,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -434,14 +538,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,

break;
default:
@@ -231,7 +248,7 @@ index b7c3c9022..16cac0253 100644

if (ntohs(*keyptr) < 1024) {
/* Loose convention: >> 512 is credential passing */
- @@ -503,12 +589,18 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -483,12 +587,18 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
@@ -251,9 +268,9 @@ index b7c3c9022..16cac0253 100644
- return;
+ return 0;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
- @@ -517,10 +609,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -497,10 +607,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
* we change the source to map into the range. For NF_INET_PRE_ROUTING
* and NF_INET_LOCAL_OUT, we change the destination to map into the
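For context, the search loop touched here probes candidate ports starting at a random offset; after a failed pass it halves its attempt budget and restarts from a fresh random offset, and the fullcone change additionally lets it report success or failure instead of returning void. A standalone sketch of that retry strategy, with hypothetical names:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Standalone sketch of the halving retry strategy used by
     * nf_nat_l4proto_unique_tuple(); is_free() stands in for the
     * 3-tuple-aware nf_nat_used_tuple() check. */
    static int pick_port(uint16_t min, uint16_t range_size,
                         bool (*is_free)(uint16_t), uint16_t *out)
    {
        unsigned int attempts = range_size;
        uint16_t off = (uint16_t)rand();

        if (!range_size)
            return -1;

        for (;;) {
            for (unsigned int i = 0; i < attempts; i++, off++) {
                uint16_t port = min + off % range_size;

                if (is_free(port)) {
                    *out = port;
                    return 0;
                }
            }
            if (attempts <= 1)
                return -1;             /* range exhausted */
            attempts /= 2;             /* retry with a smaller budget... */
            off = (uint16_t)rand();    /* ...from a fresh random offset */
        }
    }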
@@ -272,7 +289,7 @@ index b7c3c9022..16cac0253 100644
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range2 *range,
- @@ -528,8 +625,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -508,8 +623,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
@@ -284,12 +301,12 @@ index b7c3c9022..16cac0253 100644
zone = nf_ct_zone(ct);

/* 1) If this srcip/proto/src-proto-part is currently mapped,
- @@ -541,46 +641,76 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
+ @@ -521,46 +639,76 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
* manips not an issue.
*/
if (maniptype == NF_NAT_MANIP_SRC &&
- !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
- + !(nat_range.flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ + !(nat_range.flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
- if (in_range(orig_tuple, range)) {
+ if (in_range(orig_tuple, &nat_range)) {
@@ -377,7 +394,7 @@ index b7c3c9022..16cac0253 100644
}

struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
- @@ -622,7 +752,9 @@ nf_nat_setup_info(struct nf_conn *ct,
+ @@ -602,7 +750,9 @@ nf_nat_setup_info(struct nf_conn *ct,
nf_ct_invert_tuple(&curr_tuple,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -388,15 +405,15 @@ index b7c3c9022..16cac0253 100644

if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
- @@ -644,12 +776,16 @@ nf_nat_setup_info(struct nf_conn *ct,
+ @@ -624,12 +774,16 @@ nf_nat_setup_info(struct nf_conn *ct,

if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
+ unsigned int manip_src_hash;
spinlock_t *lock;

- + manip_src_hash = hash_by_src(net, &new_tuple);
- srchash = hash_by_src(net,
+ + manip_src_hash = hash_by_src(net, nf_ct_zone(ct), &new_tuple);
+ srchash = hash_by_src(net, nf_ct_zone(ct),
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
@@ -405,38 +422,41 @@ index b7c3c9022..16cac0253 100644
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
- @@ -818,6 +954,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
- h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ @@ -808,6 +962,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
+ h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
hlist_del_rcu(&ct->nat_bysource);
+ hlist_del_rcu(&ct->nat_by_manip_src);
spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

- @@ -1161,9 +1298,14 @@ static int __init nf_nat_init(void)
+ @@ -1138,12 +1293,17 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;

+ nf_nat_by_manip_src = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_by_manip_src)
+ return -ENOMEM;
+
- ret = nf_ct_extend_register(&nat_extend);
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_init(&nf_nat_locks[i]);
+
+ ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
kvfree(nf_nat_bysource);
+ kvfree(nf_nat_by_manip_src);
- pr_err("Unable to register extension\n");
return ret;
}
- @@ -1175,6 +1317,7 @@ static int __init nf_nat_init(void)
- if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
+
+ @@ -1159,6 +1319,7 @@ static int __init nf_nat_init(void)
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
kvfree(nf_nat_bysource);
+ kvfree(nf_nat_by_manip_src);
- return ret;
}

- @@ -1198,6 +1341,7 @@ static void __exit nf_nat_cleanup(void)
+ return ret;
+ @@ -1175,6 +1336,7 @@ static void __exit nf_nat_cleanup(void)

synchronize_net();
kvfree(nf_nat_bysource);
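One observation on the init hunk above: as shown, a failed allocation of the second table returns -ENOMEM without freeing nf_nat_bysource. The sketch below (hypothetical function name, reusing the patch's file-scope variables, so not standalone) pairs the two allocations with symmetric unwinding on every failure path; the extra kvfree on the early return is an assumption about the intent, not part of the patch:

    /* Sketch of the init-time pairing: the second table is allocated
     * alongside nf_nat_bysource and both are freed on any failure. */
    static int __init sketch_nat_init(void)
    {
        int ret;

        nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
        if (!nf_nat_bysource)
            return -ENOMEM;

        nf_nat_by_manip_src = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
        if (!nf_nat_by_manip_src) {
            kvfree(nf_nat_bysource);   /* assumed unwind, see note above */
            return -ENOMEM;
        }

        ret = register_pernet_subsys(&nat_net_ops);
        if (ret < 0) {
            kvfree(nf_nat_bysource);
            kvfree(nf_nat_by_manip_src);
        }
        return ret;
    }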
@@ -445,5 +465,5 @@ index b7c3c9022..16cac0253 100644
}

--
- 2.27.0
+ 2.18.0