hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_metrics.c
....@@ -40,7 +40,7 @@
4040
4141 struct tcp_metrics_block {
4242 struct tcp_metrics_block __rcu *tcpm_next;
43
- possible_net_t tcpm_net;
43
+ struct net *tcpm_net;
4444 struct inetpeer_addr tcpm_saddr;
4545 struct inetpeer_addr tcpm_daddr;
4646 unsigned long tcpm_stamp;
....@@ -51,34 +51,38 @@
5151 struct rcu_head rcu_head;
5252 };
5353
54
-static inline struct net *tm_net(struct tcp_metrics_block *tm)
54
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
5555 {
56
- return read_pnet(&tm->tcpm_net);
56
+ /* Paired with the WRITE_ONCE() in tcpm_new() */
57
+ return READ_ONCE(tm->tcpm_net);
5758 }
5859
5960 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
6061 enum tcp_metric_index idx)
6162 {
62
- return tm->tcpm_lock & (1 << idx);
63
+ /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
64
+ return READ_ONCE(tm->tcpm_lock) & (1 << idx);
6365 }
6466
65
-static u32 tcp_metric_get(struct tcp_metrics_block *tm,
67
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
6668 enum tcp_metric_index idx)
6769 {
68
- return tm->tcpm_vals[idx];
70
+ /* Paired with WRITE_ONCE() in tcp_metric_set() */
71
+ return READ_ONCE(tm->tcpm_vals[idx]);
6972 }
7073
7174 static void tcp_metric_set(struct tcp_metrics_block *tm,
7275 enum tcp_metric_index idx,
7376 u32 val)
7477 {
75
- tm->tcpm_vals[idx] = val;
78
+ /* Paired with READ_ONCE() in tcp_metric_get() */
79
+ WRITE_ONCE(tm->tcpm_vals[idx], val);
7680 }
7781
7882 static bool addr_same(const struct inetpeer_addr *a,
7983 const struct inetpeer_addr *b)
8084 {
81
- return inetpeer_addr_cmp(a, b) == 0;
85
+ return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
8286 }
8387
8488 struct tcpm_hash_bucket {
....@@ -89,6 +93,7 @@
8993 static unsigned int tcp_metrics_hash_log __read_mostly;
9094
9195 static DEFINE_SPINLOCK(tcp_metrics_lock);
96
+static DEFINE_SEQLOCK(fastopen_seqlock);
9297
9398 static void tcpm_suck_dst(struct tcp_metrics_block *tm,
9499 const struct dst_entry *dst,
....@@ -97,7 +102,7 @@
97102 u32 msval;
98103 u32 val;
99104
100
- tm->tcpm_stamp = jiffies;
105
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
101106
102107 val = 0;
103108 if (dst_metric_locked(dst, RTAX_RTT))
....@@ -110,30 +115,42 @@
110115 val |= 1 << TCP_METRIC_CWND;
111116 if (dst_metric_locked(dst, RTAX_REORDERING))
112117 val |= 1 << TCP_METRIC_REORDERING;
113
- tm->tcpm_lock = val;
118
+ /* Paired with READ_ONCE() in tcp_metric_locked() */
119
+ WRITE_ONCE(tm->tcpm_lock, val);
114120
115121 msval = dst_metric_raw(dst, RTAX_RTT);
116
- tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
122
+ tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);
117123
118124 msval = dst_metric_raw(dst, RTAX_RTTVAR);
119
- tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
120
- tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
121
- tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
122
- tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
125
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
126
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
127
+ dst_metric_raw(dst, RTAX_SSTHRESH));
128
+ tcp_metric_set(tm, TCP_METRIC_CWND,
129
+ dst_metric_raw(dst, RTAX_CWND));
130
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
131
+ dst_metric_raw(dst, RTAX_REORDERING));
123132 if (fastopen_clear) {
133
+ write_seqlock(&fastopen_seqlock);
124134 tm->tcpm_fastopen.mss = 0;
125135 tm->tcpm_fastopen.syn_loss = 0;
126136 tm->tcpm_fastopen.try_exp = 0;
127137 tm->tcpm_fastopen.cookie.exp = false;
128138 tm->tcpm_fastopen.cookie.len = 0;
139
+ write_sequnlock(&fastopen_seqlock);
129140 }
130141 }
131142
132143 #define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
133144
134
-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
145
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
146
+ const struct dst_entry *dst)
135147 {
136
- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
148
+ unsigned long limit;
149
+
150
+ if (!tm)
151
+ return;
152
+ limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
153
+ if (unlikely(time_after(jiffies, limit)))
137154 tcpm_suck_dst(tm, dst, false);
138155 }
139156
....@@ -174,20 +191,23 @@
174191 oldest = deref_locked(tcp_metrics_hash[hash].chain);
175192 for (tm = deref_locked(oldest->tcpm_next); tm;
176193 tm = deref_locked(tm->tcpm_next)) {
177
- if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
194
+ if (time_before(READ_ONCE(tm->tcpm_stamp),
195
+ READ_ONCE(oldest->tcpm_stamp)))
178196 oldest = tm;
179197 }
180198 tm = oldest;
181199 } else {
182
- tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
200
+ tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
183201 if (!tm)
184202 goto out_unlock;
185203 }
186
- write_pnet(&tm->tcpm_net, net);
204
+ /* Paired with the READ_ONCE() in tm_net() */
205
+ WRITE_ONCE(tm->tcpm_net, net);
206
+
187207 tm->tcpm_saddr = *saddr;
188208 tm->tcpm_daddr = *daddr;
189209
190
- tcpm_suck_dst(tm, dst, true);
210
+ tcpm_suck_dst(tm, dst, reclaim);
191211
192212 if (likely(!reclaim)) {
193213 tm->tcpm_next = tcp_metrics_hash[hash].chain;
....@@ -329,7 +349,7 @@
329349 int m;
330350
331351 sk_dst_confirm(sk);
332
- if (net->ipv4.sysctl_tcp_nometrics_save || !dst)
352
+ if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
333353 return;
334354
335355 rcu_read_lock();
....@@ -385,7 +405,8 @@
385405
386406 if (tcp_in_initial_slowstart(tp)) {
387407 /* Slow start still did not finish. */
388
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
408
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
409
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
389410 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
390411 if (val && (tp->snd_cwnd >> 1) > val)
391412 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
....@@ -400,7 +421,8 @@
400421 } else if (!tcp_in_slow_start(tp) &&
401422 icsk->icsk_ca_state == TCP_CA_Open) {
402423 /* Cong. avoidance phase, cwnd is reliable. */
403
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
424
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
425
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
404426 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
405427 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
406428 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
....@@ -416,7 +438,8 @@
416438 tcp_metric_set(tm, TCP_METRIC_CWND,
417439 (val + tp->snd_ssthresh) >> 1);
418440 }
419
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
441
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
442
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
420443 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
421444 if (val && tp->snd_ssthresh > val)
422445 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
....@@ -425,12 +448,13 @@
425448 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
426449 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
427450 if (val < tp->reordering &&
428
- tp->reordering != net->ipv4.sysctl_tcp_reordering)
451
+ tp->reordering !=
452
+ READ_ONCE(net->ipv4.sysctl_tcp_reordering))
429453 tcp_metric_set(tm, TCP_METRIC_REORDERING,
430454 tp->reordering);
431455 }
432456 }
433
- tm->tcpm_stamp = jiffies;
457
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
434458 out_unlock:
435459 rcu_read_unlock();
436460 }
....@@ -441,6 +465,7 @@
441465 {
442466 struct dst_entry *dst = __sk_dst_get(sk);
443467 struct tcp_sock *tp = tcp_sk(sk);
468
+ struct net *net = sock_net(sk);
444469 struct tcp_metrics_block *tm;
445470 u32 val, crtt = 0; /* cached RTT scaled by 8 */
446471
....@@ -458,7 +483,8 @@
458483 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
459484 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
460485
461
- val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
486
+ val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
487
+ 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
462488 if (val) {
463489 tp->snd_ssthresh = val;
464490 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
....@@ -512,16 +538,6 @@
512538
513539 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
514540 }
515
- /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
516
- * retransmitted. In light of RFC6298 more aggressive 1sec
517
- * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
518
- * retransmission has occurred.
519
- */
520
- if (tp->total_retrans > 1)
521
- tp->snd_cwnd = 1;
522
- else
523
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
524
- tp->snd_cwnd_stamp = tcp_jiffies32;
525541 }
526542
527543 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
....@@ -542,8 +558,6 @@
542558
543559 return ret;
544560 }
545
-
546
-static DEFINE_SEQLOCK(fastopen_seqlock);
547561
548562 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
549563 struct tcp_fastopen_cookie *cookie)
....@@ -651,18 +665,18 @@
651665 }
652666
653667 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
654
- jiffies - tm->tcpm_stamp,
668
+ jiffies - READ_ONCE(tm->tcpm_stamp),
655669 TCP_METRICS_ATTR_PAD) < 0)
656670 goto nla_put_failure;
657671
658672 {
659673 int n = 0;
660674
661
- nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
675
+ nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
662676 if (!nest)
663677 goto nla_put_failure;
664678 for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
665
- u32 val = tm->tcpm_vals[i];
679
+ u32 val = tcp_metric_get(tm, i);
666680
667681 if (!val)
668682 continue;
....@@ -948,17 +962,17 @@
948962 return 0;
949963 }
950964
951
-static const struct genl_ops tcp_metrics_nl_ops[] = {
965
+static const struct genl_small_ops tcp_metrics_nl_ops[] = {
952966 {
953967 .cmd = TCP_METRICS_CMD_GET,
968
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
954969 .doit = tcp_metrics_nl_cmd_get,
955970 .dumpit = tcp_metrics_nl_dump,
956
- .policy = tcp_metrics_nl_policy,
957971 },
958972 {
959973 .cmd = TCP_METRICS_CMD_DEL,
974
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
960975 .doit = tcp_metrics_nl_cmd_del,
961
- .policy = tcp_metrics_nl_policy,
962976 .flags = GENL_ADMIN_PERM,
963977 },
964978 };
....@@ -968,10 +982,11 @@
968982 .name = TCP_METRICS_GENL_NAME,
969983 .version = TCP_METRICS_GENL_VERSION,
970984 .maxattr = TCP_METRICS_ATTR_MAX,
985
+ .policy = tcp_metrics_nl_policy,
971986 .netnsok = true,
972987 .module = THIS_MODULE,
973
- .ops = tcp_metrics_nl_ops,
974
- .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
988
+ .small_ops = tcp_metrics_nl_ops,
989
+ .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
975990 };
976991
977992 static unsigned int tcpmhash_entries;
....@@ -1000,7 +1015,7 @@
10001015
10011016 slots = tcpmhash_entries;
10021017 if (!slots) {
1003
- if (totalram_pages >= 128 * 1024)
1018
+ if (totalram_pages() >= 128 * 1024)
10041019 slots = 16 * 1024;
10051020 else
10061021 slots = 8 * 1024;