.. | ..
| 1 | +// SPDX-License-Identifier: GPL-2.0-only
1 | 2 | /*
2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX
3 | 4 | * operating system. INET is implemented using the BSD Socket
.. | ..
37 | 38 | return 1; /* user timeout has passed; fire ASAP */
38 | 39 |
39 | 40 | return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
| 41 | +}
| 42 | +
| 43 | +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
| 44 | +{
| 45 | + struct inet_connection_sock *icsk = inet_csk(sk);
| 46 | + u32 remaining;
| 47 | + s32 elapsed;
| 48 | +
| 49 | + if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
| 50 | + return when;
| 51 | +
| 52 | + elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
| 53 | + if (unlikely(elapsed < 0))
| 54 | + elapsed = 0;
| 55 | + remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
| 56 | + remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
| 57 | +
| 58 | + return min_t(u32, remaining, when);
40 | 59 | }
41 | 60 |
42 | 61 | /**
.. | ..
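The tcp_clamp_probe0_to_user_timeout() helper added above caps the zero-window probe timer so it never fires later than the remaining TCP_USER_TIMEOUT budget, mirroring what tcp_clamp_rto_to_user_timeout() (whose tail is the context at old lines 37-40) already does for the RTO timer. For context, that budget is set from userspace; a minimal, illustrative sketch (the 20-second value and the function name are arbitrary, not part of the patch):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_USER_TIMEOUT
#define TCP_USER_TIMEOUT 18	/* value from linux/tcp.h, for older libc headers */
#endif

/* Abort the connection if transmitted data stays unacknowledged for 20 s. */
static int set_user_timeout(int fd)
{
	unsigned int timeout_ms = 20000;	/* TCP_USER_TIMEOUT takes milliseconds */

	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
			  &timeout_ms, sizeof(timeout_ms));
}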
124 | 143 | */
125 | 144 | static int tcp_orphan_retries(struct sock *sk, bool alive)
126 | 145 | {
127 | | - int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
| 146 | + int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
128 | 147 |
129 | 148 | /* We know from an ICMP that something is wrong. */
130 | 149 | if (sk->sk_err_soft && !alive)
.. | ..
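Several hunks here (and in the MTU-probing and retransmit paths below) wrap sysctl reads in READ_ONCE(): these knobs are read locklessly from timer and softirq context while a concurrent sysctl write may change them, so the accesses need data-race annotations. A minimal kernel-style sketch of the paired idiom, not taken from this file and with made-up names:

#include <linux/compiler.h>

static int example_knob;

/* Writer (e.g. a sysctl handler): WRITE_ONCE() publishes the value as a
 * single store that lockless readers can observe without tearing. */
static void example_knob_set(int val)
{
	WRITE_ONCE(example_knob, val);
}

/* Reader (e.g. a timer callback): READ_ONCE() stops the compiler from
 * tearing or re-fetching the load within the function. */
static int example_knob_get(void)
{
	return READ_ONCE(example_knob);
}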
144 | 163 | int mss;
145 | 164 |
146 | 165 | /* Black hole detection */
147 | | - if (!net->ipv4.sysctl_tcp_mtu_probing)
| 166 | + if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
148 | 167 | return;
149 | 168 |
150 | 169 | if (!icsk->icsk_mtup.enabled) {
.. | ..
152 | 171 | icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
153 | 172 | } else {
154 | 173 | mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
155 | | - mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
156 | | - mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
157 | | - mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
| 174 | + mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
| 175 | + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
| 176 | + mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
158 | 177 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
159 | 178 | }
160 | 179 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
.. | ..
218 | 237 | int retry_until;
219 | 238 |
220 | 239 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
221 | | - if (icsk->icsk_retransmits) {
222 | | - dst_negative_advice(sk);
223 | | - } else {
224 | | - sk_rethink_txhash(sk);
225 | | - }
| 240 | + if (icsk->icsk_retransmits)
| 241 | + __dst_negative_advice(sk);
226 | 242 | retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
227 | 243 | expired = icsk->icsk_retransmits >= retry_until;
228 | 244 | } else {
229 | | - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
| 245 | + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
230 | 246 | /* Black hole detection */
231 | 247 | tcp_mtu_probing(icsk, sk);
232 | 248 |
233 | | - dst_negative_advice(sk);
234 | | - } else {
235 | | - sk_rethink_txhash(sk);
| 249 | + __dst_negative_advice(sk);
236 | 250 | }
237 | 251 |
238 | | - retry_until = net->ipv4.sysctl_tcp_retries2;
| 252 | + retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
239 | 253 | if (sock_flag(sk, SOCK_DEAD)) {
240 | 254 | const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
241 | 255 |
.. | ..
263 | 277 | return 1;
264 | 278 | }
265 | 279 |
| 280 | + if (sk_rethink_txhash(sk)) {
| 281 | + tp->timeout_rehash++;
| 282 | + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
| 283 | + }
| 284 | +
266 | 285 | return 0;
267 | 286 | }
268 | 287 |
.. | ..
284 | 303 | icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
285 | 304 |
286 | 305 | if (inet_csk_ack_scheduled(sk)) {
287 | | - if (!icsk->icsk_ack.pingpong) {
| 306 | + if (!inet_csk_in_pingpong_mode(sk)) {
288 | 307 | /* Delayed ACK missed: inflate ATO. */
289 | 308 | icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
290 | 309 | } else {
291 | 310 | /* Delayed ACK missed: leave pingpong mode and
292 | 311 | * deflate ATO.
293 | 312 | */
294 | | - icsk->icsk_ack.pingpong = 0;
| 313 | + inet_csk_exit_pingpong_mode(sk);
295 | 314 | icsk->icsk_ack.ato = TCP_ATO_MIN;
296 | 315 | }
297 | 316 | tcp_mstamp_refresh(tcp_sk(sk));
.. | ..
307 | 326 |
308 | 327 | /**
309 | 328 | * tcp_delack_timer() - The TCP delayed ACK timeout handler
310 | | - * @data: Pointer to the current socket. (gets casted to struct sock *)
| 329 | + * @t: Pointer to the timer. (gets casted to struct sock *)
311 | 330 | *
312 | 331 | * This function gets (indirectly) called when the kernel timer for a TCP packet
313 | 332 | * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
.. | ..
324 | 343 | if (!sock_owned_by_user(sk)) {
325 | 344 | tcp_delack_timer_handler(sk);
326 | 345 | } else {
327 | | - icsk->icsk_ack.blocked = 1;
328 | 346 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
329 | 347 | /* deleguate our work to tcp_release_cb() */
330 | 348 | if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
.. | ..
340 | 358 | struct sk_buff *skb = tcp_send_head(sk);
341 | 359 | struct tcp_sock *tp = tcp_sk(sk);
342 | 360 | int max_probes;
343 | | - u32 start_ts;
344 | 361 |
345 | 362 | if (tp->packets_out || !skb) {
346 | 363 | icsk->icsk_probes_out = 0;
| 364 | + icsk->icsk_probes_tstamp = 0;
347 | 365 | return;
348 | 366 | }
349 | 367 |
.. | ..
355 | 373 | * corresponding system limit. We also implement similar policy when
356 | 374 | * we use RTO to probe window in tcp_retransmit_timer().
357 | 375 | */
358 | | - start_ts = tcp_skb_timestamp(skb);
359 | | - if (!start_ts)
360 | | - skb->skb_mstamp = tp->tcp_mstamp;
| 376 | + if (!icsk->icsk_probes_tstamp)
| 377 | + icsk->icsk_probes_tstamp = tcp_jiffies32;
361 | 378 | else if (icsk->icsk_user_timeout &&
362 | | - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
| 379 | + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
| 380 | + msecs_to_jiffies(icsk->icsk_user_timeout))
363 | 381 | goto abort;
364 | 382 |
365 | | - max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
| 383 | + max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
366 | 384 | if (sock_flag(sk, SOCK_DEAD)) {
367 | 385 | const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
368 | 386 |
.. | ..
385 | 403 | * Timer for Fast Open socket to retransmit SYNACK. Note that the
386 | 404 | * sk here is the child socket, not the parent (listener) socket.
387 | 405 | */
388 | | -static void tcp_fastopen_synack_timer(struct sock *sk)
| 406 | +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
389 | 407 | {
390 | 408 | struct inet_connection_sock *icsk = inet_csk(sk);
391 | 409 | int max_retries = icsk->icsk_syn_retries ? :
392 | 410 | sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
393 | | - struct request_sock *req;
| 411 | + struct tcp_sock *tp = tcp_sk(sk);
394 | 412 |
395 | | - req = tcp_sk(sk)->fastopen_rsk;
396 | 413 | req->rsk_ops->syn_ack_timeout(req);
397 | 414 |
398 | 415 | if (req->num_timeout >= max_retries) {
399 | 416 | tcp_write_err(sk);
400 | 417 | return;
401 | 418 | }
| 419 | + /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
| 420 | + if (icsk->icsk_retransmits == 1)
| 421 | + tcp_enter_loss(sk);
402 | 422 | /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
403 | 423 | * returned from rtx_syn_ack() to make it more persistent like
404 | 424 | * regular retransmit because if the child socket has been accepted
.. | ..
407 | 427 | inet_rtx_syn_ack(sk, req);
408 | 428 | req->num_timeout++;
409 | 429 | icsk->icsk_retransmits++;
| 430 | + if (!tp->retrans_stamp)
| 431 | + tp->retrans_stamp = tcp_time_stamp(tp);
410 | 432 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
411 | 433 | TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
412 | 434 | }
413 | 435 |
| 436 | +static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
| 437 | + const struct sk_buff *skb)
| 438 | +{
| 439 | + const struct tcp_sock *tp = tcp_sk(sk);
| 440 | + const int timeout = TCP_RTO_MAX * 2;
| 441 | + u32 rcv_delta, rtx_delta;
| 442 | +
| 443 | + rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
| 444 | + if (rcv_delta <= timeout)
| 445 | + return false;
| 446 | +
| 447 | + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
| 448 | + (tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
| 449 | +
| 450 | + return rtx_delta > timeout;
| 451 | +}
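The new tcp_rtx_probe0_timed_out() helper is used further down in tcp_retransmit_timer() in place of the old bare check on tp->rcv_tstamp: a connection stuck against a zero receive window is only declared dead once both the last receive activity and the start of retransmissions lie more than 2 * TCP_RTO_MAX in the past. It relies on wrap-safe unsigned deltas between 32-bit timestamps; a standalone sketch of that idiom (illustrative names, plain C rather than the kernel's jiffies helpers):

#include <stdbool.h>
#include <stdint.h>

/* Subtracting two u32 tick counters yields the elapsed ticks even across
 * counter wrap-around, provided the true difference is below 2^31. */
static bool elapsed_longer_than(uint32_t now, uint32_t then, uint32_t timeout)
{
	return (uint32_t)(now - then) > timeout;
}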
414 | 452 |
415 | 453 | /**
416 | 454 | * tcp_retransmit_timer() - The TCP retransmit timeout handler
.. | ..
428 | 466 | struct tcp_sock *tp = tcp_sk(sk);
429 | 467 | struct net *net = sock_net(sk);
430 | 468 | struct inet_connection_sock *icsk = inet_csk(sk);
| 469 | + struct request_sock *req;
| 470 | + struct sk_buff *skb;
431 | 471 |
432 | | - if (tp->fastopen_rsk) {
| 472 | + req = rcu_dereference_protected(tp->fastopen_rsk,
| 473 | + lockdep_sock_is_held(sk));
| 474 | + if (req) {
433 | 475 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
434 | 476 | sk->sk_state != TCP_FIN_WAIT1);
435 | | - tcp_fastopen_synack_timer(sk);
| 477 | + tcp_fastopen_synack_timer(sk, req);
436 | 478 | /* Before we receive ACK to our SYN-ACK don't retransmit
437 | 479 | * anything else (e.g., data or FIN segments).
438 | 480 | */
439 | 481 | return;
440 | 482 | }
441 | | - if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
| 483 | +
| 484 | + if (!tp->packets_out)
| 485 | + return;
| 486 | +
| 487 | + skb = tcp_rtx_queue_head(sk);
| 488 | + if (WARN_ON_ONCE(!skb))
442 | 489 | return;
443 | 490 |
444 | 491 | tp->tlp_high_seq = 0;
.. | ..
467 | 514 | tp->snd_una, tp->snd_nxt);
468 | 515 | }
469 | 516 | #endif
470 | | - if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
| 517 | + if (tcp_rtx_probe0_timed_out(sk, skb)) {
471 | 518 | tcp_write_err(sk);
472 | 519 | goto out;
473 | 520 | }
474 | 521 | tcp_enter_loss(sk);
475 | | - tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
| 522 | + tcp_retransmit_skb(sk, skb, 1);
476 | 523 | __sk_dst_reset(sk);
477 | 524 | goto out_reset_timer;
478 | 525 | }
.. | ..
504 | 551 |
505 | 552 | tcp_enter_loss(sk);
506 | 553 |
| 554 | + icsk->icsk_retransmits++;
507 | 555 | if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
508 | 556 | /* Retransmission failed because of local congestion,
509 | | - * do not backoff.
| 557 | + * Let senders fight for local resources conservatively.
510 | 558 | */
511 | | - if (!icsk->icsk_retransmits)
512 | | - icsk->icsk_retransmits = 1;
513 | 559 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
514 | | - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
| 560 | + TCP_RESOURCE_PROBE_INTERVAL,
515 | 561 | TCP_RTO_MAX);
516 | 562 | goto out;
517 | 563 | }
.. | ..
532 | 578 | * the 120 second clamps though!
533 | 579 | */
534 | 580 | icsk->icsk_backoff++;
535 | | - icsk->icsk_retransmits++;
536 | 581 |
537 | 582 | out_reset_timer:
538 | 583 | /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
.. | ..
545 | 590 | * linear-timeout retransmissions into a black hole
546 | 591 | */
547 | 592 | if (sk->sk_state == TCP_ESTABLISHED &&
548 | | - (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
| 593 | + (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
549 | 594 | tcp_stream_is_thin(tp) &&
550 | 595 | icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
551 | 596 | icsk->icsk_backoff = 0;
552 | | - icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
| 597 | + icsk->icsk_rto = clamp(__tcp_set_rto(tp),
| 598 | + tcp_rto_min(sk),
| 599 | + TCP_RTO_MAX);
553 | 600 | } else {
554 | 601 | /* Use normal (exponential) backoff */
555 | 602 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
556 | 603 | }
557 | 604 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
558 | 605 | tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
559 | | - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
| 606 | + if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
560 | 607 | __sk_dst_reset(sk);
561 | 608 |
562 | 609 | out:;
.. | ..
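The thin-stream branch above keeps using linear (non-backed-off) timeouts when either the per-socket tp->thin_lto flag or the tcp_thin_linear_timeouts sysctl is set; the changes are that the sysctl is now read with READ_ONCE() and that the recomputed RTO is clamped to tcp_rto_min() as well as TCP_RTO_MAX. For context, the per-socket flag is controlled from userspace; a small illustrative sketch (the function name is made up):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_THIN_LINEAR_TIMEOUTS
#define TCP_THIN_LINEAR_TIMEOUTS 16	/* value from linux/tcp.h */
#endif

/* Opt this socket into linear retransmission timeouts for thin streams. */
static int enable_thin_linear_timeouts(int fd)
{
	int on = 1;

	return setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
			  &on, sizeof(on));
}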
735 | 782 |
736 | 783 | bh_lock_sock(sk);
737 | 784 | if (!sock_owned_by_user(sk)) {
738 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
| 785 | + if (tp->compressed_ack) {
| 786 | + /* Since we have to send one ack finally,
| 787 | + * substract one from tp->compressed_ack to keep
| 788 | + * LINUX_MIB_TCPACKCOMPRESSED accurate.
| 789 | + */
| 790 | + tp->compressed_ack--;
739 | 791 | tcp_send_ack(sk);
| 792 | + }
740 | 793 | } else {
741 | 794 | if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
742 | 795 | &sk->sk_tsq_flags))