commit 9370bb92b2d16684ee45cf24e879c93c509162da (2024-12-19)

--- a/kernel/net/ipv4/tcp_timer.c
+++ b/kernel/net/ipv4/tcp_timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
@@ -37,6 +38,24 @@
 		return 1; /* user timeout has passed; fire ASAP */
 
 	return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
+}
+
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 remaining;
+	s32 elapsed;
+
+	if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+		return when;
+
+	elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
+	if (unlikely(elapsed < 0))
+		elapsed = 0;
+	remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+	remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
+
+	return min_t(u32, remaining, when);
 }
 
 /**
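The helper added above caps the next zero-window probe interval at whatever is left of the TCP_USER_TIMEOUT budget, so probe0 rearming can no longer push a connection past the user's deadline. A minimal userspace sketch of the same arithmetic, with stand-in types and constants rather than kernel API:

#include <stdint.h>
#include <stdio.h>

#define TCP_TIMEOUT_MIN 2u	/* stand-in floor, in "jiffies" */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Mirrors tcp_clamp_probe0_to_user_timeout(): "when" is the interval the
 * caller wanted; shrink it so the probe fires no later than the moment
 * the user-timeout budget runs out. */
static uint32_t clamp_probe0(uint32_t user_timeout, uint32_t probes_tstamp,
			     uint32_t now, uint32_t when)
{
	int32_t elapsed;
	uint32_t remaining;

	if (!user_timeout || !probes_tstamp)
		return when;			/* no budget armed */

	elapsed = (int32_t)(now - probes_tstamp);
	if (elapsed < 0)
		elapsed = 0;			/* stamp from the "future" */
	remaining = user_timeout - (uint32_t)elapsed;
	remaining = max_u32(remaining, TCP_TIMEOUT_MIN);

	return min_u32(remaining, when);
}

int main(void)
{
	/* Budget 10000, 9000 already spent, caller wanted 4000: fire in 1000. */
	printf("%u\n", (unsigned)clamp_probe0(10000, 1, 9001, 4000));
	return 0;
}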
@@ -124,7 +143,7 @@
  */
 static int tcp_orphan_retries(struct sock *sk, bool alive)
 {
-	int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
+	int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
 
 	/* We know from an ICMP that something is wrong. */
 	if (sk->sk_err_soft && !alive)
@@ -144,7 +163,7 @@
 	int mss;
 
 	/* Black hole detection */
-	if (!net->ipv4.sysctl_tcp_mtu_probing)
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
 		return;
 
 	if (!icsk->icsk_mtup.enabled) {
@@ -152,9 +171,9 @@
 		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
 	} else {
 		mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
-		mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
-		mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
-		mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
+		mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
+		mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
+		mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
 		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 	}
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
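The READ_ONCE() conversions here and in the neighbouring hunks are the standard treatment for sysctls written locklessly by the proc handler: each read becomes a single, non-refetched load, so one evaluation of the function cannot observe two different values of the same knob. A userspace model of the idiom (the volatile-cast definition mirrors the kernel's; the struct and field names are illustrative):

/* Userspace model of the kernel's READ_ONCE(): force exactly one load
 * through a volatile-qualified lvalue so the compiler cannot re-read. */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

struct ipv4_sysctls {
	int tcp_base_mss;	/* may be updated concurrently via proc */
};

static int pick_probe_mss(const struct ipv4_sysctls *s, int mss)
{
	int base = READ_ONCE(s->tcp_base_mss);	/* one load, one value */

	return mss < base ? mss : base;
}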
@@ -218,24 +237,19 @@
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-		if (icsk->icsk_retransmits) {
-			dst_negative_advice(sk);
-		} else {
-			sk_rethink_txhash(sk);
-		}
+		if (icsk->icsk_retransmits)
+			__dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		expired = icsk->icsk_retransmits >= retry_until;
 	} else {
-		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
+		if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
-			dst_negative_advice(sk);
-		} else {
-			sk_rethink_txhash(sk);
+			__dst_negative_advice(sk);
 		}
 
-		retry_until = net->ipv4.sysctl_tcp_retries2;
+		retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
 		if (sock_flag(sk, SOCK_DEAD)) {
 			const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
 
@@ -263,6 +277,11 @@
 		return 1;
 	}
 
+	if (sk_rethink_txhash(sk)) {
+		tp->timeout_rehash++;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+	}
+
 	return 0;
 }
 
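With the rehash consolidated here, every expired write timer attempts exactly one txhash rethink, and a successful rehash is counted both per-socket (tp->timeout_rehash) and in the TCPTimeoutRehash MIB. A toy sketch of the count-on-success shape, with stand-in names throughout:

#include <stdbool.h>

struct flow {
	unsigned int txhash;
	unsigned int timeout_rehash;	/* per-flow counter */
};

static unsigned long tcp_timeout_rehash_mib;	/* global counter stand-in */

/* Stand-in for sk_rethink_txhash(): returns true when a new hash was set. */
static bool rethink_txhash(struct flow *f)
{
	f->txhash = f->txhash * 2654435761u + 1;	/* pick any new value */
	return true;
}

static void on_write_timeout(struct flow *f)
{
	if (rethink_txhash(f)) {	/* count only actual rehashes */
		f->timeout_rehash++;
		tcp_timeout_rehash_mib++;
	}
}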
@@ -284,14 +303,14 @@
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
 	if (inet_csk_ack_scheduled(sk)) {
-		if (!icsk->icsk_ack.pingpong) {
+		if (!inet_csk_in_pingpong_mode(sk)) {
 			/* Delayed ACK missed: inflate ATO. */
 			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
 			 */
-			icsk->icsk_ack.pingpong = 0;
+			inet_csk_exit_pingpong_mode(sk);
 			icsk->icsk_ack.ato = TCP_ATO_MIN;
 		}
 		tcp_mstamp_refresh(tcp_sk(sk));
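No behaviour changes in this hunk; direct icsk_ack.pingpong accesses become the inet_csk_*_pingpong_mode() accessors. The surrounding logic is the delayed-ACK backoff: outside pingpong mode a missed delayed ACK doubles the ATO up to the RTO, while inside it the socket leaves pingpong mode and the ATO deflates to the floor. A sketch of that adjustment, with stand-in constants:

#define TCP_ATO_MIN	5u	/* stand-in floor, in jiffies */

struct ack_state {
	unsigned int ato;	/* quick-ack timeout */
	unsigned int rto;	/* retransmission timeout, upper bound */
	int pingpong;		/* interactive-session heuristic */
};

static void delack_missed(struct ack_state *a)
{
	if (!a->pingpong) {
		/* Delayed ACK missed: inflate ATO, but never past the RTO. */
		unsigned int doubled = a->ato << 1;

		a->ato = doubled < a->rto ? doubled : a->rto;
	} else {
		/* Leave pingpong mode and deflate ATO to the floor. */
		a->pingpong = 0;
		a->ato = TCP_ATO_MIN;
	}
}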
@@ -307,7 +326,7 @@
 
 /**
  * tcp_delack_timer() - The TCP delayed ACK timeout handler
- * @data: Pointer to the current socket. (gets casted to struct sock *)
+ * @t: Pointer to the timer. (gets casted to struct sock *)
  *
  * This function gets (indirectly) called when the kernel timer for a TCP packet
 * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
@@ -324,7 +343,6 @@
 	if (!sock_owned_by_user(sk)) {
 		tcp_delack_timer_handler(sk);
 	} else {
-		icsk->icsk_ack.blocked = 1;
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 		/* deleguate our work to tcp_release_cb() */
 		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
@@ -340,10 +358,10 @@
 	struct sk_buff *skb = tcp_send_head(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
-	u32 start_ts;
 
 	if (tp->packets_out || !skb) {
 		icsk->icsk_probes_out = 0;
+		icsk->icsk_probes_tstamp = 0;
 		return;
 	}
 
@@ -355,14 +373,14 @@
 	 * corresponding system limit. We also implement similar policy when
 	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	start_ts = tcp_skb_timestamp(skb);
-	if (!start_ts)
-		skb->skb_mstamp = tp->tcp_mstamp;
+	if (!icsk->icsk_probes_tstamp)
+		icsk->icsk_probes_tstamp = tcp_jiffies32;
 	else if (icsk->icsk_user_timeout &&
-		 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
+		 (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+		 msecs_to_jiffies(icsk->icsk_user_timeout))
 		goto abort;
 
-	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
+	max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 
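The probe path now tracks its start in icsk_probes_tstamp (jiffies) instead of borrowing, and sometimes rewriting, the skb timestamp, and the expiry test uses the usual wraparound-safe jiffies comparison: subtract as u32, reinterpret as signed. A self-contained sketch of that idiom:

#include <stdbool.h>
#include <stdint.h>

/* Wraparound-safe "has the budget elapsed?" test, the (s32)(a - b) idiom:
 * even if `now` wrapped past 2^32 while `since` did not, the signed view
 * of the u32 difference is still the true elapsed time (for spans under
 * 2^31 ticks). */
static bool timeout_elapsed(uint32_t now, uint32_t since, uint32_t budget)
{
	return budget && (int32_t)(now - since) >= (int32_t)budget;
}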
@@ -385,20 +403,22 @@
  * Timer for Fast Open socket to retransmit SYNACK. Note that the
  * sk here is the child socket, not the parent (listener) socket.
  */
-static void tcp_fastopen_synack_timer(struct sock *sk)
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int max_retries = icsk->icsk_syn_retries ? :
 	    sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
-	struct request_sock *req;
+	struct tcp_sock *tp = tcp_sk(sk);
 
-	req = tcp_sk(sk)->fastopen_rsk;
 	req->rsk_ops->syn_ack_timeout(req);
 
 	if (req->num_timeout >= max_retries) {
 		tcp_write_err(sk);
 		return;
 	}
+	/* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+	if (icsk->icsk_retransmits == 1)
+		tcp_enter_loss(sk);
 	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
 	 * returned from rtx_syn_ack() to make it more persistent like
 	 * regular retransmit because if the child socket has been accepted
@@ -407,10 +427,28 @@
 	inet_rtx_syn_ack(sk, req);
 	req->num_timeout++;
 	icsk->icsk_retransmits++;
+	if (!tp->retrans_stamp)
+		tp->retrans_stamp = tcp_time_stamp(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 			  TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 }
 
+static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+				     const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const int timeout = TCP_RTO_MAX * 2;
+	u32 rcv_delta, rtx_delta;
+
+	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
+	if (rcv_delta <= timeout)
+		return false;
+
+	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
+			(tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
+
+	return rtx_delta > timeout;
+}
 
 /**
  * tcp_retransmit_timer() - The TCP retransmit timeout handler
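tcp_rtx_probe0_timed_out() replaces the bare rcv_tstamp check used later in tcp_retransmit_timer(): the connection is reset only when the peer has been silent for more than 2 * TCP_RTO_MAX and the retransmission series itself has been running at least that long, so a freshly started series is not killed early. A compact model of the two-delta test, using a single time unit and a stand-in constant:

#include <stdbool.h>
#include <stdint.h>

#define RTO_MAX 120u	/* stand-in for TCP_RTO_MAX, e.g. in seconds */

static bool probe0_timed_out(uint32_t timer_expiry, uint32_t last_rcv,
			     uint32_t now, uint32_t rtx_start)
{
	const uint32_t timeout = RTO_MAX * 2;

	if (timer_expiry - last_rcv <= timeout)
		return false;	/* heard from the peer recently enough */

	/* The retransmission series must also be old enough. */
	return now - rtx_start > timeout;
}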
@@ -428,17 +466,26 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
+	struct sk_buff *skb;
 
-	if (tp->fastopen_rsk) {
+	req = rcu_dereference_protected(tp->fastopen_rsk,
+					lockdep_sock_is_held(sk));
+	if (req) {
 		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
 			     sk->sk_state != TCP_FIN_WAIT1);
-		tcp_fastopen_synack_timer(sk);
+		tcp_fastopen_synack_timer(sk, req);
 		/* Before we receive ACK to our SYN-ACK don't retransmit
 		 * anything else (e.g., data or FIN segments).
 		 */
 		return;
 	}
-	if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
+
+	if (!tp->packets_out)
+		return;
+
+	skb = tcp_rtx_queue_head(sk);
+	if (WARN_ON_ONCE(!skb))
 		return;
 
 	tp->tlp_high_seq = 0;
@@ -467,12 +514,12 @@
 			       tp->snd_una, tp->snd_nxt);
 		}
 #endif
-		if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
+		if (tcp_rtx_probe0_timed_out(sk, skb)) {
 			tcp_write_err(sk);
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
+		tcp_retransmit_skb(sk, skb, 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -504,14 +551,13 @@
 
 	tcp_enter_loss(sk);
 
+	icsk->icsk_retransmits++;
 	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
-		 * do not backoff.
+		 * Let senders fight for local resources conservatively.
 		 */
-		if (!icsk->icsk_retransmits)
-			icsk->icsk_retransmits = 1;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RESOURCE_PROBE_INTERVAL,
 					  TCP_RTO_MAX);
 		goto out;
 	}
@@ -532,7 +578,6 @@
 	 * the 120 second clamps though!
 	 */
 	icsk->icsk_backoff++;
-	icsk->icsk_retransmits++;
 
 out_reset_timer:
 	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
@@ -545,18 +590,20 @@
 	 * linear-timeout retransmissions into a black hole
 	 */
 	if (sk->sk_state == TCP_ESTABLISHED &&
-	    (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
+	    (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
 	    tcp_stream_is_thin(tp) &&
 	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
 		icsk->icsk_backoff = 0;
-		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+		icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+				       tcp_rto_min(sk),
+				       TCP_RTO_MAX);
 	} else {
 		/* Use normal (exponential) backoff */
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
+	if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
 		__sk_dst_reset(sk);
 
 out:;
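For thin streams the recomputed RTO is now clamp()ed between tcp_rto_min() and TCP_RTO_MAX instead of only being min()ed against the ceiling; with a pathological RTT estimate the old expression could come out below the permitted floor. The kernel's clamp() is equivalent to min(max(v, lo), hi), roughly:

/* clamp(): bound v to [lo, hi]. */
static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	if (v < lo)
		return lo;
	if (v > hi)
		return hi;
	return v;
}

/* Usage shape matching the hunk above (names are stand-ins):
 *	rto = clamp_ul(computed_rto, rto_min, rto_max);
 */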
@@ -735,8 +782,14 @@
 
 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk)) {
-		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
+		if (tp->compressed_ack) {
+			/* Since we have to send one ack finally,
+			 * substract one from tp->compressed_ack to keep
+			 * LINUX_MIB_TCPACKCOMPRESSED accurate.
+			 */
+			tp->compressed_ack--;
 			tcp_send_ack(sk);
+		}
 	} else {
 		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
 				      &sk->sk_tsq_flags))
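The compressed-ACK timer previously fired only when more than TCP_FASTRETRANS_THRESH ACKs had been compressed; it now fires whenever any were, and decrements the counter first so that LINUX_MIB_TCPACKCOMPRESSED (credited from tp->compressed_ack when an ACK actually goes out) counts only ACKs that were genuinely saved. A toy model of that accounting, with stand-in names:

struct ack_compression {
	unsigned int compressed_ack;	/* ACKs withheld so far */
	unsigned long mib_saved;	/* stand-in for the MIB counter */
};

static void send_ack(struct ack_compression *c)
{
	/* On a real send, the pending tally is credited to the MIB and
	 * cleared, mirroring where the kernel folds compressed_ack in. */
	c->mib_saved += c->compressed_ack;
	c->compressed_ack = 0;
}

static void compressed_ack_timer(struct ack_compression *c)
{
	if (c->compressed_ack) {
		c->compressed_ack--;	/* this final ACK was not "saved" */
		send_ack(c);
	}
}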