  ..   ..
         1  +// SPDX-License-Identifier: GPL-2.0-only
   1     2   /*
   2     3    * INET        An implementation of the TCP/IP protocol suite for the LINUX
   3     4    *             operating system.  INET is implemented using the BSD Socket
  ..   ..
  37    38       return 1;  /* user timeout has passed; fire ASAP */
  38    39   
  39    40       return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
        41  +}
        42  +
        43  +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
        44  +{
        45  +    struct inet_connection_sock *icsk = inet_csk(sk);
        46  +    u32 remaining;
        47  +    s32 elapsed;
        48  +
        49  +    if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
        50  +        return when;
        51  +
        52  +    elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
        53  +    if (unlikely(elapsed < 0))
        54  +        elapsed = 0;
        55  +    remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
        56  +    remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
        57  +
        58  +    return min_t(u32, remaining, when);
  40    59   }
  41    60   
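The new tcp_clamp_probe0_to_user_timeout() keeps the zero-window probe timer from overshooting a user-set TCP_USER_TIMEOUT: each rearm is shortened to whatever budget is left. For context, here is a minimal userspace sketch of how a connection opts into that timeout; the 30-second value is illustrative:

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Cap how long TCP may keep retransmitting (or probing a zero
 * window) without hearing from the peer.  Value is in milliseconds.
 */
int set_user_timeout(int fd)
{
	unsigned int timeout_ms = 30 * 1000;	/* illustrative: 30 s */

	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
			  &timeout_ms, sizeof(timeout_ms));
}
```

With the clamp in place, the probe interval can no longer round the deadline up to the next full backed-off RTO, so the option fires on schedule.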
  42    61   /**
  ..   ..
 124   143    */
 125   144   static int tcp_orphan_retries(struct sock *sk, bool alive)
 126   145   {
 127         -    int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
       146   +    int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
 128   147   
 129   148       /* We know from an ICMP that something is wrong. */
 130   149       if (sk->sk_err_soft && !alive)
  ..   ..
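The READ_ONCE() conversions in this and the following hunks all follow one pattern: per-netns sysctls can be rewritten from the proc handler with no socket lock held, so both sides are annotated to prevent load/store tearing and to document the intentional data race. A generic sketch of the pattern (my_sysctl is a stand-in, not a real kernel field):

```c
#include <linux/compiler.h>

static int my_sysctl;	/* stand-in for a net->ipv4.sysctl_* field */

/* fast path: read locklessly, exactly once */
static int reader(void)
{
	return READ_ONCE(my_sysctl);
}

/* sysctl handler side: the paired store annotation */
static void writer(int v)
{
	WRITE_ONCE(my_sysctl, v);
}
```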
 144   163       int mss;
 145   164   
 146   165       /* Black hole detection */
 147         -    if (!net->ipv4.sysctl_tcp_mtu_probing)
       166   +    if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
 148   167           return;
 149   168   
 150   169       if (!icsk->icsk_mtup.enabled) {
  ..   ..
 152   171           icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
 153   172       } else {
 154   173           mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
 155         -        mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
 156         -        mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
 157         -        mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
       174   +        mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
       175   +        mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
       176   +        mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
 158   177           icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 159   178       }
 160   179       tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
  ..   ..
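Note the middle clamp: the hard-coded `68 - tcp_header_len` floor gives way to the tcp_mtu_probe_floor sysctl. Replaying the arithmetic in userspace, assuming the usual defaults (tcp_base_mss = 1024, tcp_mtu_probe_floor = 48); the starting MSS is illustrative:

```c
#include <stdio.h>

/* On a suspected MTU black hole, halve the current search_low MSS,
 * cap it at base_mss, then apply the configured floor.
 */
static int next_probe_mss(int search_low_mss, int base_mss, int floor_mss)
{
	int mss = search_low_mss / 2;

	if (mss > base_mss)
		mss = base_mss;
	if (mss < floor_mss)
		mss = floor_mss;
	return mss;
}

int main(void)
{
	printf("%d\n", next_probe_mss(1400, 1024, 48));	/* -> 700 */
	return 0;
}
```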
 218   237       int retry_until;
 219   238   
 220   239       if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 221         -        if (icsk->icsk_retransmits) {
 222         -            dst_negative_advice(sk);
 223         -        } else {
 224         -            sk_rethink_txhash(sk);
 225         -        }
       240   +        if (icsk->icsk_retransmits)
       241   +            __dst_negative_advice(sk);
 226   242           retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 227   243           expired = icsk->icsk_retransmits >= retry_until;
 228   244       } else {
 229         -        if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
       245   +        if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
 230   246               /* Black hole detection */
 231   247               tcp_mtu_probing(icsk, sk);
 232   248   
 233         -            dst_negative_advice(sk);
 234         -        } else {
 235         -            sk_rethink_txhash(sk);
       249   +            __dst_negative_advice(sk);
 236   250           }
 237   251   
 238         -        retry_until = net->ipv4.sysctl_tcp_retries2;
       252   +        retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
 239   253           if (sock_flag(sk, SOCK_DEAD)) {
 240   254               const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
 241   255   
  ..   ..
 263   277           return 1;
 264   278       }
 265   279   
       280   +    if (sk_rethink_txhash(sk)) {
       281   +        tp->timeout_rehash++;
       282   +        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
       283   +    }
       284   +
 266   285       return 0;
 267   286   }
 268   287   
  ..   ..
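retransmits_timed_out() converts a retry count such as tcp_retries1/tcp_retries2 into elapsed time: the backoff doubles from the base RTO until it saturates at TCP_RTO_MAX, then grows linearly. A userspace recomputation of that model (mirroring tcp_model_timeout() elsewhere in this file, with the default constants TCP_RTO_MIN = 200 ms and TCP_RTO_MAX = 120 s):

```c
#include <stdio.h>

static unsigned int model_timeout_ms(unsigned int boundary,
				     unsigned int rto_base_ms,
				     unsigned int rto_max_ms)
{
	unsigned int thresh = 0, timeout;

	/* thresh = ilog2(rto_max / rto_base) */
	while ((rto_base_ms << (thresh + 1)) <= rto_max_ms)
		thresh++;

	if (boundary <= thresh)
		timeout = ((2 << boundary) - 1) * rto_base_ms;
	else
		timeout = ((2 << thresh) - 1) * rto_base_ms +
			  (boundary - thresh) * rto_max_ms;
	return timeout;
}

int main(void)
{
	/* Default tcp_retries2 = 15 -> 924600 ms, about 15.4 minutes. */
	printf("%u ms\n", model_timeout_ms(15, 200, 120000));
	return 0;
}
```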
 284   303       icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 285   304   
 286   305       if (inet_csk_ack_scheduled(sk)) {
 287         -        if (!icsk->icsk_ack.pingpong) {
       306   +        if (!inet_csk_in_pingpong_mode(sk)) {
 288   307               /* Delayed ACK missed: inflate ATO. */
 289   308               icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 290   309           } else {
 291   310               /* Delayed ACK missed: leave pingpong mode and
 292   311                * deflate ATO.
 293   312                */
 294         -            icsk->icsk_ack.pingpong = 0;
       313   +            inet_csk_exit_pingpong_mode(sk);
 295   314               icsk->icsk_ack.ato = TCP_ATO_MIN;
 296   315           }
 297   316           tcp_mstamp_refresh(tcp_sk(sk));
  ..   ..
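The open-coded icsk_ack.pingpong flag is replaced by accessors; upstream, pingpong became a small counter compared against TCP_PINGPONG_THRESH rather than a boolean. Roughly, the helpers in include/net/inet_connection_sock.h look like the following (sketched from memory, not part of this diff):

```c
/* Interactive ("pingpong") sessions delay ACKs to piggyback them on
 * responses; the counter lets heuristics require a few exchanges
 * before the socket is treated as interactive.
 */
static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 0;
}
```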
 307   326   
 308   327   /**
 309   328    * tcp_delack_timer() - The TCP delayed ACK timeout handler
 310         - * @data: Pointer to the current socket. (gets casted to struct sock *)
       329   + * @t: Pointer to the timer. (gets cast to struct sock *)
 311   330    *
 312   331    * This function gets (indirectly) called when the kernel timer for a TCP packet
 313   332    * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
  ..   ..
 324   343       if (!sock_owned_by_user(sk)) {
 325   344           tcp_delack_timer_handler(sk);
 326   345       } else {
 327         -        icsk->icsk_ack.blocked = 1;
 328   346           __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 329   347           /* delegate our work to tcp_release_cb() */
 330   348           if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
  ..   ..
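The removed icsk_ack.blocked flag only fed a statistic; the deferral itself works by flagging the socket and letting tcp_release_cb() replay the work once user space drops the lock. A sketch of that pattern (the flag bit and helpers are real kernel symbols; the wrapper function is illustrative):

```c
/* Timer fired while user space owns the socket: record the deferred
 * work and hold a reference so the socket survives until
 * tcp_release_cb() runs it.
 */
static void defer_delack(struct sock *sk)
{
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
		sock_hold(sk);
}
```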
 340   358       struct sk_buff *skb = tcp_send_head(sk);
 341   359       struct tcp_sock *tp = tcp_sk(sk);
 342   360       int max_probes;
 343         -    u32 start_ts;
 344   361   
 345   362       if (tp->packets_out || !skb) {
 346   363           icsk->icsk_probes_out = 0;
       364   +        icsk->icsk_probes_tstamp = 0;
 347   365           return;
 348   366       }
 349   367   
  ..   ..
 355   373        * corresponding system limit. We also implement similar policy when
 356   374        * we use RTO to probe window in tcp_retransmit_timer().
 357   375        */
 358         -    start_ts = tcp_skb_timestamp(skb);
 359         -    if (!start_ts)
 360         -        skb->skb_mstamp = tp->tcp_mstamp;
       376   +    if (!icsk->icsk_probes_tstamp)
       377   +        icsk->icsk_probes_tstamp = tcp_jiffies32;
 361   378       else if (icsk->icsk_user_timeout &&
 362         -             (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
       379   +             (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
       380   +             msecs_to_jiffies(icsk->icsk_user_timeout))
 363   381           goto abort;
 364   382   
 365         -    max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
       383   +    max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
 366   384       if (sock_flag(sk, SOCK_DEAD)) {
 367   385           const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 368   386   
  ..   ..
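The `(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp)` comparison is the standard wraparound-safe timestamp idiom: subtract as unsigned, interpret the difference as signed. A self-contained rendition:

```c
#include <stdbool.h>
#include <stdint.h>

/* True iff stamp a is earlier than stamp b.  Unsigned subtraction
 * followed by a signed cast stays correct across counter wraparound,
 * as long as the two stamps are less than 2^31 ticks apart.
 */
static bool timestamp_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}
```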
 385   403    * Timer for Fast Open socket to retransmit SYNACK. Note that the
 386   404    * sk here is the child socket, not the parent (listener) socket.
 387   405    */
 388         -static void tcp_fastopen_synack_timer(struct sock *sk)
       406   +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 389   407   {
 390   408       struct inet_connection_sock *icsk = inet_csk(sk);
 391   409       int max_retries = icsk->icsk_syn_retries ? :
 392   410               sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
 393         -    struct request_sock *req;
       411   +    struct tcp_sock *tp = tcp_sk(sk);
 394   412   
 395         -    req = tcp_sk(sk)->fastopen_rsk;
 396   413       req->rsk_ops->syn_ack_timeout(req);
 397   414   
 398   415       if (req->num_timeout >= max_retries) {
 399   416           tcp_write_err(sk);
 400   417           return;
 401   418       }
       419   +    /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
       420   +    if (icsk->icsk_retransmits == 1)
       421   +        tcp_enter_loss(sk);
 402   422       /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
 403   423        * returned from rtx_syn_ack() to make it more persistent like
 404   424        * regular retransmit because if the child socket has been accepted
  ..   ..
 407   427       inet_rtx_syn_ack(sk, req);
 408   428       req->num_timeout++;
 409   429       icsk->icsk_retransmits++;
       430   +    if (!tp->retrans_stamp)
       431   +        tp->retrans_stamp = tcp_time_stamp(tp);
 410   432       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 411   433                                 TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 412   434   }
 413   435   
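This timer only exists on TFO child sockets, i.e. on servers that enabled Fast Open on the listener. For reference, a minimal server-side opt-in; the queue length of 16 is illustrative:

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Allow up to 16 TFO connections that have not completed the
 * three-way handshake.  (Clients opt in separately, e.g. with
 * sendto(..., MSG_FASTOPEN, ...) or TCP_FASTOPEN_CONNECT.)
 */
int enable_tfo(int listen_fd)
{
	int qlen = 16;

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
			  &qlen, sizeof(qlen));
}
```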
       436   +static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
       437   +                                     const struct sk_buff *skb)
       438   +{
       439   +    const struct tcp_sock *tp = tcp_sk(sk);
       440   +    const int timeout = TCP_RTO_MAX * 2;
       441   +    u32 rcv_delta, rtx_delta;
       442   +
       443   +    rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
       444   +    if (rcv_delta <= timeout)
       445   +        return false;
       446   +
       447   +    rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
       448   +                                      (tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
       449   +
       450   +    return rtx_delta > timeout;
       451   +}
 414   452   
 415   453   /**
 416   454    * tcp_retransmit_timer() - The TCP retransmit timeout handler
  ..   ..
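The new helper tightens the old rcv_tstamp-only check: the connection is killed only when the peer has been silent for more than 2 * TCP_RTO_MAX *and* retransmissions have been running at least that long (the GNU `?:` falls back to the skb timestamp when retrans_stamp is zero). A userspace rendition of the predicate, all values in milliseconds and all names illustrative:

```c
#include <stdbool.h>
#include <stdint.h>

#define RTO_MAX_MS	120000U		/* mirrors TCP_RTO_MAX */

static bool rtx_probe0_timed_out(uint32_t now_ms, uint32_t last_rcv_ms,
				 uint32_t first_rtx_ms)
{
	const uint32_t timeout = 2 * RTO_MAX_MS;

	if (now_ms - last_rcv_ms <= timeout)
		return false;	/* heard from the peer recently enough */
	return now_ms - first_rtx_ms > timeout;
}
```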
 428   466       struct tcp_sock *tp = tcp_sk(sk);
 429   467       struct net *net = sock_net(sk);
 430   468       struct inet_connection_sock *icsk = inet_csk(sk);
       469   +    struct request_sock *req;
       470   +    struct sk_buff *skb;
 431   471   
 432         -    if (tp->fastopen_rsk) {
       472   +    req = rcu_dereference_protected(tp->fastopen_rsk,
       473   +                                    lockdep_sock_is_held(sk));
       474   +    if (req) {
 433   475           WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
 434   476                        sk->sk_state != TCP_FIN_WAIT1);
 435         -        tcp_fastopen_synack_timer(sk);
       477   +        tcp_fastopen_synack_timer(sk, req);
 436   478           /* Before we receive ACK to our SYN-ACK don't retransmit
 437   479            * anything else (e.g., data or FIN segments).
 438   480            */
 439   481           return;
 440   482       }
 441         -    if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
       483   +
       484   +    if (!tp->packets_out)
       485   +        return;
       486   +
       487   +    skb = tcp_rtx_queue_head(sk);
       488   +    if (WARN_ON_ONCE(!skb))
 442   489           return;
 443   490   
 444   491       tp->tlp_high_seq = 0;
  ..   ..
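fastopen_rsk is now an __rcu pointer; since the retransmit timer runs with the socket lock held, the update-side accessor rcu_dereference_protected() is used, with lockdep_sock_is_held() as the proof that no RCU read lock is needed. The generic shape of that pattern, on a made-up struct:

```c
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct bar;
struct foo {
	struct bar __rcu *ptr;
	spinlock_t lock;
};

/* Update-side read: ->lock excludes writers, so no rcu_read_lock() is
 * required; lockdep verifies the claim on CONFIG_PROVE_RCU kernels.
 */
static struct bar *foo_ptr_locked(struct foo *f)
{
	return rcu_dereference_protected(f->ptr, lockdep_is_held(&f->lock));
}
```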
 467   514                     tp->snd_una, tp->snd_nxt);
 468   515       }
 469   516   #endif
 470         -    if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
       517   +    if (tcp_rtx_probe0_timed_out(sk, skb)) {
 471   518           tcp_write_err(sk);
 472   519           goto out;
 473   520       }
 474   521       tcp_enter_loss(sk);
 475         -    tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
       522   +    tcp_retransmit_skb(sk, skb, 1);
 476   523       __sk_dst_reset(sk);
 477   524       goto out_reset_timer;
 478   525   }
  ..   ..
 504   551   
 505   552       tcp_enter_loss(sk);
 506   553   
       554   +    icsk->icsk_retransmits++;
 507   555       if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 508   556           /* Retransmission failed because of local congestion;
 509         -         * do not backoff.
       557   +         * let senders fight for local resources conservatively.
 510   558            */
 511         -        if (!icsk->icsk_retransmits)
 512         -            icsk->icsk_retransmits = 1;
 513   559           inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 514         -                                  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
       560   +                                  TCP_RESOURCE_PROBE_INTERVAL,
 515   561                                     TCP_RTO_MAX);
 516   562           goto out;
 517   563       }
  ..   ..
 532   578        * the 120 second clamps though!
 533   579        */
 534   580       icsk->icsk_backoff++;
 535         -    icsk->icsk_retransmits++;
 536   581   
 537   582   out_reset_timer:
 538   583       /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
  ..   ..
 545   590        * linear-timeout retransmissions into a black hole
 546   591        */
 547   592       if (sk->sk_state == TCP_ESTABLISHED &&
 548         -        (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
       593   +        (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
 549   594           tcp_stream_is_thin(tp) &&
 550   595           icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
 551   596           icsk->icsk_backoff = 0;
 552         -        icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
       597   +        icsk->icsk_rto = clamp(__tcp_set_rto(tp),
       598   +                               tcp_rto_min(sk),
       599   +                               TCP_RTO_MAX);
 553   600       } else {
 554   601           /* Use normal (exponential) backoff */
 555   602           icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 556   603       }
 557   604       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 558   605                                 tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
 559         -    if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
       606   +    if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
 560   607           __sk_dst_reset(sk);
 561   608   
 562   609   out:;
  ..   ..
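tp->thin_lto, checked above, is the per-socket switch for the same behavior as the sysctl, scoped to one connection. Applications set it with TCP_THIN_LINEAR_TIMEOUTS:

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Opt a single socket into linear retransmission timeouts while the
 * stream is "thin" (few packets in flight), instead of exponential
 * backoff.  Useful for latency-sensitive, low-rate traffic.
 */
int enable_thin_lto(int fd)
{
	int on = 1;

	return setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
			  &on, sizeof(on));
}
```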
 735   782   
 736   783       bh_lock_sock(sk);
 737   784       if (!sock_owned_by_user(sk)) {
 738         -        if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
       785   +        if (tp->compressed_ack) {
       786   +            /* Since we have to send one ack finally,
       787   +             * subtract one from tp->compressed_ack to keep
       788   +             * LINUX_MIB_TCPACKCOMPRESSED accurate.
       789   +             */
       790   +            tp->compressed_ack--;
 739   791               tcp_send_ack(sk);
       792   +        }
 740   793       } else {
 741   794           if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
 742   795                                 &sk->sk_tsq_flags))