.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
---|
3 | 4 | * operating system. INET is implemented using the BSD Socket |
---|
.. | .. |
---|
37 | 38 | #define pr_fmt(fmt) "TCP: " fmt |
---|
38 | 39 | |
---|
39 | 40 | #include <net/tcp.h> |
---|
| 41 | +#include <net/mptcp.h> |
---|
40 | 42 | |
---|
41 | 43 | #include <linux/compiler.h> |
---|
42 | 44 | #include <linux/gfp.h> |
---|
.. | .. |
---|
44 | 46 | #include <linux/static_key.h> |
---|
45 | 47 | |
---|
46 | 48 | #include <trace/events/tcp.h> |
---|
| 49 | + |
---|
| 50 | +/* Refresh clocks of a TCP socket, |
---|
| 51 | + * ensuring monotonically increasing values. |
---|
| 52 | + */ |
---|
| 53 | +void tcp_mstamp_refresh(struct tcp_sock *tp) |
---|
| 54 | +{ |
---|
| 55 | + u64 val = tcp_clock_ns(); |
---|
| 56 | + |
---|
| 57 | + tp->tcp_clock_cache = val; |
---|
| 58 | + tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); |
---|
| 59 | +} |
---|
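For illustration, a minimal user-space sketch (not kernel code; the struct and `now_ns()` helper are stand-ins) of what the new `tcp_mstamp_refresh()` caches: one raw nanosecond reading feeds both the nanosecond cache and the derived microsecond stamp, so the two views of "now" always agree.

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-ins for tp->tcp_clock_cache and tp->tcp_mstamp. */
struct tcp_clock_view {
	uint64_t clock_cache_ns;	/* raw nanosecond reading */
	uint64_t mstamp_us;		/* same instant, in microseconds */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Mirrors tcp_mstamp_refresh(): a single clock read, two cached views. */
static void mstamp_refresh(struct tcp_clock_view *v)
{
	uint64_t val = now_ns();

	v->clock_cache_ns = val;
	v->mstamp_us = val / 1000;	/* div_u64(val, NSEC_PER_USEC) */
}

int main(void)
{
	struct tcp_clock_view v;

	mstamp_refresh(&v);
	printf("ns=%llu us=%llu\n", (unsigned long long)v.clock_cache_ns,
	       (unsigned long long)v.mstamp_us);
	return 0;
}
```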
47 | 60 | |
---|
48 | 61 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
---|
49 | 62 | int push_one, gfp_t gfp); |
---|
.. | .. |
---|
55 | 68 | struct tcp_sock *tp = tcp_sk(sk); |
---|
56 | 69 | unsigned int prior_packets = tp->packets_out; |
---|
57 | 70 | |
---|
58 | | - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
---|
| 71 | + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); |
---|
59 | 72 | |
---|
60 | 73 | __skb_unlink(skb, &sk->sk_write_queue); |
---|
61 | 74 | tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); |
---|
.. | .. |
---|
69 | 82 | |
---|
70 | 83 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, |
---|
71 | 84 | tcp_skb_pcount(skb)); |
---|
| 85 | + tcp_check_space(sk); |
---|
72 | 86 | } |
---|
73 | 87 | |
---|
74 | 88 | /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one |
---|
.. | .. |
---|
159 | 173 | * packet, enter pingpong mode. |
---|
160 | 174 | */ |
---|
161 | 175 | if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) |
---|
162 | | - icsk->icsk_ack.pingpong = 1; |
---|
| 176 | + inet_csk_enter_pingpong_mode(sk); |
---|
163 | 177 | } |
---|
164 | 178 | |
---|
165 | 179 | /* Account for an ACK we sent. */ |
---|
166 | | -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, |
---|
167 | | - u32 rcv_nxt) |
---|
| 180 | +static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) |
---|
168 | 181 | { |
---|
169 | 182 | struct tcp_sock *tp = tcp_sk(sk); |
---|
170 | 183 | |
---|
171 | | - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { |
---|
| 184 | + if (unlikely(tp->compressed_ack)) { |
---|
172 | 185 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
---|
173 | | - tp->compressed_ack - TCP_FASTRETRANS_THRESH); |
---|
174 | | - tp->compressed_ack = TCP_FASTRETRANS_THRESH; |
---|
| 186 | + tp->compressed_ack); |
---|
| 187 | + tp->compressed_ack = 0; |
---|
175 | 188 | if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
---|
176 | 189 | __sock_put(sk); |
---|
177 | 190 | } |
---|
178 | 191 | |
---|
179 | 192 | if (unlikely(rcv_nxt != tp->rcv_nxt)) |
---|
180 | 193 | return; /* Special ACK sent by DCTCP to reflect ECN */ |
---|
181 | | - tcp_dec_quickack_mode(sk, pkts); |
---|
| 194 | + tcp_dec_quickack_mode(sk); |
---|
182 | 195 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); |
---|
183 | 196 | } |
---|
184 | 197 | |
---|
.. | .. |
---|
221 | 234 | if (init_rcv_wnd) |
---|
222 | 235 | *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); |
---|
223 | 236 | |
---|
224 | | - (*rcv_wscale) = 0; |
---|
| 237 | + *rcv_wscale = 0; |
---|
225 | 238 | if (wscale_ok) { |
---|
226 | 239 | /* Set window scaling on max possible window */ |
---|
227 | | - space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
---|
228 | | - space = max_t(u32, space, sysctl_rmem_max); |
---|
| 240 | + space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); |
---|
| 241 | + space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); |
---|
229 | 242 | space = min_t(u32, space, *window_clamp); |
---|
230 | | - while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { |
---|
231 | | - space >>= 1; |
---|
232 | | - (*rcv_wscale)++; |
---|
233 | | - } |
---|
| 243 | + *rcv_wscale = clamp_t(int, ilog2(space) - 15, |
---|
| 244 | + 0, TCP_MAX_WSCALE); |
---|
234 | 245 | } |
---|
235 | 246 | /* Set the clamp no higher than max representable value */ |
---|
236 | 247 | (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); |
---|
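As a quick sanity check on the new closed-form window-scale computation, here is a small user-space sketch (assumptions: `__builtin_clz()` as an `ilog2()` stand-in and a plain clamp helper) comparing the removed shift loop against `ilog2(space) - 15` clamped to `[0, TCP_MAX_WSCALE]`; both pick the same scale.

```c
#include <stdint.h>
#include <stdio.h>

#define U16_MAX		0xffffu
#define TCP_MAX_WSCALE	14

/* ilog2() stand-in: index of the highest set bit (v must be non-zero). */
static int ilog2_u32(uint32_t v)
{
	return 31 - __builtin_clz(v);
}

static int clamp_int(int val, int lo, int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

int main(void)
{
	uint32_t samples[] = { 65535u, 65536u, 1u << 20, 6291456u, 0xffffffffu };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint32_t space = samples[i];
		uint32_t s = space;
		int loop_wscale = 0;

		/* Removed loop: shift until the window fits in 16 bits. */
		while (s > U16_MAX && loop_wscale < TCP_MAX_WSCALE) {
			s >>= 1;
			loop_wscale++;
		}

		/* New form: the scale is how far the top bit sits above
		 * bit 15, clamped to the valid wscale range.
		 */
		int clamp_wscale = clamp_int(ilog2_u32(space) - 15,
					     0, TCP_MAX_WSCALE);

		printf("space=%10u loop=%2d clamp=%2d\n",
		       (unsigned int)space, loop_wscale, clamp_wscale);
	}
	return 0;
}
```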
.. | .. |
---|
401 | 412 | #define OPTION_WSCALE (1 << 3) |
---|
402 | 413 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) |
---|
403 | 414 | #define OPTION_SMC (1 << 9) |
---|
| 415 | +#define OPTION_MPTCP (1 << 10) |
---|
404 | 416 | |
---|
405 | 417 | static void smc_options_write(__be32 *ptr, u16 *options) |
---|
406 | 418 | { |
---|
.. | .. |
---|
423 | 435 | u8 ws; /* window scale, 0 to disable */ |
---|
424 | 436 | u8 num_sack_blocks; /* number of SACK blocks to include */ |
---|
425 | 437 | u8 hash_size; /* bytes in hash_location */ |
---|
| 438 | + u8 bpf_opt_len; /* length of BPF hdr option */ |
---|
426 | 439 | __u8 *hash_location; /* temporary pointer, overloaded */ |
---|
427 | 440 | __u32 tsval, tsecr; /* need to include OPTION_TS */ |
---|
428 | 441 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ |
---|
| 442 | + struct mptcp_out_options mptcp; |
---|
429 | 443 | }; |
---|
| 444 | + |
---|
| 445 | +static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) |
---|
| 446 | +{ |
---|
| 447 | +#if IS_ENABLED(CONFIG_MPTCP) |
---|
| 448 | + if (unlikely(OPTION_MPTCP & opts->options)) |
---|
| 449 | + mptcp_write_options(ptr, &opts->mptcp); |
---|
| 450 | +#endif |
---|
| 451 | +} |
---|
| 452 | + |
---|
| 453 | +#ifdef CONFIG_CGROUP_BPF |
---|
| 454 | +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, |
---|
| 455 | + enum tcp_synack_type synack_type) |
---|
| 456 | +{ |
---|
| 457 | + if (unlikely(!skb)) |
---|
| 458 | + return BPF_WRITE_HDR_TCP_CURRENT_MSS; |
---|
| 459 | + |
---|
| 460 | + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) |
---|
| 461 | + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; |
---|
| 462 | + |
---|
| 463 | + return 0; |
---|
| 464 | +} |
---|
| 465 | + |
---|
| 466 | +/* req, syn_skb and synack_type are used when writing synack */ |
---|
| 467 | +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, |
---|
| 468 | + struct request_sock *req, |
---|
| 469 | + struct sk_buff *syn_skb, |
---|
| 470 | + enum tcp_synack_type synack_type, |
---|
| 471 | + struct tcp_out_options *opts, |
---|
| 472 | + unsigned int *remaining) |
---|
| 473 | +{ |
---|
| 474 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 475 | + int err; |
---|
| 476 | + |
---|
| 477 | + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
---|
| 478 | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || |
---|
| 479 | + !*remaining) |
---|
| 480 | + return; |
---|
| 481 | + |
---|
| 482 | + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ |
---|
| 483 | + |
---|
| 484 | + /* init sock_ops */ |
---|
| 485 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 486 | + |
---|
| 487 | + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; |
---|
| 488 | + |
---|
| 489 | + if (req) { |
---|
| 490 | + /* The listen "sk" cannot be passed here because |
---|
| 491 | + * it is not locked. It would also not make much |
---|
| 492 | + * sense to do bpf_setsockopt(listen_sk) based |
---|
| 493 | + * on an individual connection request. |
---|
| 494 | + * |
---|
| 495 | + * Thus, "req" is passed here and the cgroup-bpf-progs |
---|
| 496 | + * of the listen "sk" will be run. |
---|
| 497 | + * |
---|
| 498 | + * "req" is also used here for fastopen even when the "sk" here is |
---|
| 499 | + * a fullsock "child" sk, to keep the behavior |
---|
| 500 | + * consistent between fastopen and non-fastopen on |
---|
| 501 | + * the bpf programming side. |
---|
| 502 | + */ |
---|
| 503 | + sock_ops.sk = (struct sock *)req; |
---|
| 504 | + sock_ops.syn_skb = syn_skb; |
---|
| 505 | + } else { |
---|
| 506 | + sock_owned_by_me(sk); |
---|
| 507 | + |
---|
| 508 | + sock_ops.is_fullsock = 1; |
---|
| 509 | + sock_ops.sk = sk; |
---|
| 510 | + } |
---|
| 511 | + |
---|
| 512 | + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); |
---|
| 513 | + sock_ops.remaining_opt_len = *remaining; |
---|
| 514 | + /* tcp_current_mss() does not pass a skb */ |
---|
| 515 | + if (skb) |
---|
| 516 | + bpf_skops_init_skb(&sock_ops, skb, 0); |
---|
| 517 | + |
---|
| 518 | + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); |
---|
| 519 | + |
---|
| 520 | + if (err || sock_ops.remaining_opt_len == *remaining) |
---|
| 521 | + return; |
---|
| 522 | + |
---|
| 523 | + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; |
---|
| 524 | + /* round up to 4 bytes */ |
---|
| 525 | + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; |
---|
| 526 | + |
---|
| 527 | + *remaining -= opts->bpf_opt_len; |
---|
| 528 | +} |
---|
| 529 | + |
---|
| 530 | +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, |
---|
| 531 | + struct request_sock *req, |
---|
| 532 | + struct sk_buff *syn_skb, |
---|
| 533 | + enum tcp_synack_type synack_type, |
---|
| 534 | + struct tcp_out_options *opts) |
---|
| 535 | +{ |
---|
| 536 | + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; |
---|
| 537 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 538 | + int err; |
---|
| 539 | + |
---|
| 540 | + if (likely(!max_opt_len)) |
---|
| 541 | + return; |
---|
| 542 | + |
---|
| 543 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 544 | + |
---|
| 545 | + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; |
---|
| 546 | + |
---|
| 547 | + if (req) { |
---|
| 548 | + sock_ops.sk = (struct sock *)req; |
---|
| 549 | + sock_ops.syn_skb = syn_skb; |
---|
| 550 | + } else { |
---|
| 551 | + sock_owned_by_me(sk); |
---|
| 552 | + |
---|
| 553 | + sock_ops.is_fullsock = 1; |
---|
| 554 | + sock_ops.sk = sk; |
---|
| 555 | + } |
---|
| 556 | + |
---|
| 557 | + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); |
---|
| 558 | + sock_ops.remaining_opt_len = max_opt_len; |
---|
| 559 | + first_opt_off = tcp_hdrlen(skb) - max_opt_len; |
---|
| 560 | + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); |
---|
| 561 | + |
---|
| 562 | + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); |
---|
| 563 | + |
---|
| 564 | + if (err) |
---|
| 565 | + nr_written = 0; |
---|
| 566 | + else |
---|
| 567 | + nr_written = max_opt_len - sock_ops.remaining_opt_len; |
---|
| 568 | + |
---|
| 569 | + if (nr_written < max_opt_len) |
---|
| 570 | + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, |
---|
| 571 | + max_opt_len - nr_written); |
---|
| 572 | +} |
---|
| 573 | +#else |
---|
| 574 | +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, |
---|
| 575 | + struct request_sock *req, |
---|
| 576 | + struct sk_buff *syn_skb, |
---|
| 577 | + enum tcp_synack_type synack_type, |
---|
| 578 | + struct tcp_out_options *opts, |
---|
| 579 | + unsigned int *remaining) |
---|
| 580 | +{ |
---|
| 581 | +} |
---|
| 582 | + |
---|
| 583 | +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, |
---|
| 584 | + struct request_sock *req, |
---|
| 585 | + struct sk_buff *syn_skb, |
---|
| 586 | + enum tcp_synack_type synack_type, |
---|
| 587 | + struct tcp_out_options *opts) |
---|
| 588 | +{ |
---|
| 589 | +} |
---|
| 590 | +#endif |
---|
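For context, a sketch of the kind of cgroup sock_ops BPF program these two helpers serve: it reserves space in the HDR_OPT_LEN callback and writes the bytes in the WRITE_HDR_OPT callback. The experimental option kind (0xfd), its one-byte payload and the program layout are illustrative assumptions, not part of this patch. The callbacks only run once BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG has been enabled for the socket (the BPF_SOCK_OPS_TEST_FLAG check above), e.g. via bpf_sock_ops_cb_flags_set() from an earlier callback.

```c
// SPDX-License-Identifier: GPL-2.0
/* Illustrative sock_ops sketch; option layout is an assumption. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct tcp_exp_opt {
	__u8 kind;	/* experimental option kind, assumed 0xfd */
	__u8 len;	/* total option length */
	__u8 val;	/* one byte of payload */
} __attribute__((packed));

SEC("sockops")
int tcp_hdr_opt(struct bpf_sock_ops *skops)
{
	struct tcp_exp_opt opt = { .kind = 0xfd, .len = 3, .val = 0x2a };

	switch (skops->op) {
	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
		/* Ask the stack to keep 3 bytes of option space; the
		 * stack rounds it up to 4 in bpf_skops_hdr_opt_len().
		 */
		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
		/* The stack points us at the reserved area; unwritten
		 * bytes are padded with TCPOPT_NOP afterwards.
		 */
		bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
```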
430 | 591 | |
---|
431 | 592 | /* Write previously computed TCP options to the packet. |
---|
432 | 593 | * |
---|
.. | .. |
---|
536 | 697 | } |
---|
537 | 698 | |
---|
538 | 699 | smc_options_write(ptr, &options); |
---|
| 700 | + |
---|
| 701 | + mptcp_options_write(ptr, opts); |
---|
539 | 702 | } |
---|
540 | 703 | |
---|
541 | 704 | static void smc_set_option(const struct tcp_sock *tp, |
---|
.. | .. |
---|
571 | 734 | #endif |
---|
572 | 735 | } |
---|
573 | 736 | |
---|
| 737 | +static void mptcp_set_option_cond(const struct request_sock *req, |
---|
| 738 | + struct tcp_out_options *opts, |
---|
| 739 | + unsigned int *remaining) |
---|
| 740 | +{ |
---|
| 741 | + if (rsk_is_mptcp(req)) { |
---|
| 742 | + unsigned int size; |
---|
| 743 | + |
---|
| 744 | + if (mptcp_synack_options(req, &size, &opts->mptcp)) { |
---|
| 745 | + if (*remaining >= size) { |
---|
| 746 | + opts->options |= OPTION_MPTCP; |
---|
| 747 | + *remaining -= size; |
---|
| 748 | + } |
---|
| 749 | + } |
---|
| 750 | + } |
---|
| 751 | +} |
---|
| 752 | + |
---|
574 | 753 | /* Compute TCP options for SYN packets. This is not the final |
---|
575 | 754 | * network wire format yet. |
---|
576 | 755 | */ |
---|
.. | .. |
---|
584 | 763 | |
---|
585 | 764 | *md5 = NULL; |
---|
586 | 765 | #ifdef CONFIG_TCP_MD5SIG |
---|
587 | | - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { |
---|
| 766 | + if (static_branch_unlikely(&tcp_md5_needed) && |
---|
| 767 | + rcu_access_pointer(tp->md5sig_info)) { |
---|
588 | 768 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
---|
589 | 769 | if (*md5) { |
---|
590 | 770 | opts->options |= OPTION_MD5; |
---|
.. | .. |
---|
605 | 785 | opts->mss = tcp_advertise_mss(sk); |
---|
606 | 786 | remaining -= TCPOLEN_MSS_ALIGNED; |
---|
607 | 787 | |
---|
608 | | - if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { |
---|
| 788 | + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { |
---|
609 | 789 | opts->options |= OPTION_TS; |
---|
610 | 790 | opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; |
---|
611 | 791 | opts->tsecr = tp->rx_opt.ts_recent; |
---|
612 | 792 | remaining -= TCPOLEN_TSTAMP_ALIGNED; |
---|
613 | 793 | } |
---|
614 | | - if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { |
---|
| 794 | + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { |
---|
615 | 795 | opts->ws = tp->rx_opt.rcv_wscale; |
---|
616 | 796 | opts->options |= OPTION_WSCALE; |
---|
617 | 797 | remaining -= TCPOLEN_WSCALE_ALIGNED; |
---|
618 | 798 | } |
---|
619 | | - if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { |
---|
| 799 | + if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { |
---|
620 | 800 | opts->options |= OPTION_SACK_ADVERTISE; |
---|
621 | 801 | if (unlikely(!(OPTION_TS & opts->options))) |
---|
622 | 802 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
---|
.. | .. |
---|
639 | 819 | |
---|
640 | 820 | smc_set_option(tp, opts, &remaining); |
---|
641 | 821 | |
---|
| 822 | + if (sk_is_mptcp(sk)) { |
---|
| 823 | + unsigned int size; |
---|
| 824 | + |
---|
| 825 | + if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { |
---|
| 826 | + opts->options |= OPTION_MPTCP; |
---|
| 827 | + remaining -= size; |
---|
| 828 | + } |
---|
| 829 | + } |
---|
| 830 | + |
---|
| 831 | + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); |
---|
| 832 | + |
---|
642 | 833 | return MAX_TCP_OPTION_SPACE - remaining; |
---|
643 | 834 | } |
---|
644 | 835 | |
---|
.. | .. |
---|
649 | 840 | struct tcp_out_options *opts, |
---|
650 | 841 | const struct tcp_md5sig_key *md5, |
---|
651 | 842 | struct tcp_fastopen_cookie *foc, |
---|
652 | | - enum tcp_synack_type synack_type) |
---|
| 843 | + enum tcp_synack_type synack_type, |
---|
| 844 | + struct sk_buff *syn_skb) |
---|
653 | 845 | { |
---|
654 | 846 | struct inet_request_sock *ireq = inet_rsk(req); |
---|
655 | 847 | unsigned int remaining = MAX_TCP_OPTION_SPACE; |
---|
.. | .. |
---|
681 | 873 | if (likely(ireq->tstamp_ok)) { |
---|
682 | 874 | opts->options |= OPTION_TS; |
---|
683 | 875 | opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; |
---|
684 | | - opts->tsecr = req->ts_recent; |
---|
| 876 | + opts->tsecr = READ_ONCE(req->ts_recent); |
---|
685 | 877 | remaining -= TCPOLEN_TSTAMP_ALIGNED; |
---|
686 | 878 | } |
---|
687 | 879 | if (likely(ireq->sack_ok)) { |
---|
.. | .. |
---|
702 | 894 | } |
---|
703 | 895 | } |
---|
704 | 896 | |
---|
| 897 | + mptcp_set_option_cond(req, opts, &remaining); |
---|
| 898 | + |
---|
705 | 899 | smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); |
---|
| 900 | + |
---|
| 901 | + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, |
---|
| 902 | + synack_type, opts, &remaining); |
---|
706 | 903 | |
---|
707 | 904 | return MAX_TCP_OPTION_SPACE - remaining; |
---|
708 | 905 | } |
---|
.. | .. |
---|
722 | 919 | |
---|
723 | 920 | *md5 = NULL; |
---|
724 | 921 | #ifdef CONFIG_TCP_MD5SIG |
---|
725 | | - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { |
---|
| 922 | + if (static_branch_unlikely(&tcp_md5_needed) && |
---|
| 923 | + rcu_access_pointer(tp->md5sig_info)) { |
---|
726 | 924 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
---|
727 | 925 | if (*md5) { |
---|
728 | 926 | opts->options |= OPTION_MD5; |
---|
.. | .. |
---|
738 | 936 | size += TCPOLEN_TSTAMP_ALIGNED; |
---|
739 | 937 | } |
---|
740 | 938 | |
---|
| 939 | + /* MPTCP options have precedence over SACK for the limited TCP |
---|
| 940 | + * option space because an MPTCP connection would be forced to |
---|
| 941 | + * fall back to regular TCP if a required multipath option is |
---|
| 942 | + * missing. SACK still gets a chance to use whatever space is |
---|
| 943 | + * left. |
---|
| 944 | + */ |
---|
| 945 | + if (sk_is_mptcp(sk)) { |
---|
| 946 | + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; |
---|
| 947 | + unsigned int opt_size = 0; |
---|
| 948 | + |
---|
| 949 | + if (mptcp_established_options(sk, skb, &opt_size, remaining, |
---|
| 950 | + &opts->mptcp)) { |
---|
| 951 | + opts->options |= OPTION_MPTCP; |
---|
| 952 | + size += opt_size; |
---|
| 953 | + } |
---|
| 954 | + } |
---|
| 955 | + |
---|
741 | 956 | eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; |
---|
742 | 957 | if (unlikely(eff_sacks)) { |
---|
743 | 958 | const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; |
---|
| 959 | + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + |
---|
| 960 | + TCPOLEN_SACK_PERBLOCK)) |
---|
| 961 | + return size; |
---|
| 962 | + |
---|
744 | 963 | opts->num_sack_blocks = |
---|
745 | 964 | min_t(unsigned int, eff_sacks, |
---|
746 | 965 | (remaining - TCPOLEN_SACK_BASE_ALIGNED) / |
---|
747 | 966 | TCPOLEN_SACK_PERBLOCK); |
---|
748 | | - if (likely(opts->num_sack_blocks)) |
---|
749 | | - size += TCPOLEN_SACK_BASE_ALIGNED + |
---|
750 | | - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; |
---|
| 967 | + |
---|
| 968 | + size += TCPOLEN_SACK_BASE_ALIGNED + |
---|
| 969 | + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; |
---|
| 970 | + } |
---|
| 971 | + |
---|
| 972 | + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, |
---|
| 973 | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { |
---|
| 974 | + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; |
---|
| 975 | + |
---|
| 976 | + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); |
---|
| 977 | + |
---|
| 978 | + size = MAX_TCP_OPTION_SPACE - remaining; |
---|
751 | 979 | } |
---|
752 | 980 | |
---|
753 | 981 | return size; |
---|
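A worked example of the option-space accounting this function now performs, as a user-space sketch. The 20-byte MPTCP DSS size is an illustrative assumption (real DSS sizes vary); with timestamps (12 bytes) and that DSS in place, only 8 of the 40 option bytes remain, which is below the 12 bytes one SACK block needs, so the new early return fires and SACK is skipped for this packet.

```c
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE		40
#define TCPOLEN_TSTAMP_ALIGNED		12
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8

/* Hypothetical MPTCP DSS option size, only for this example. */
#define EXAMPLE_MPTCP_DSS_SIZE		20

int main(void)
{
	unsigned int size = 0, eff_sacks = 3, num_sack_blocks, remaining;

	size += TCPOLEN_TSTAMP_ALIGNED;	/* timestamps: 12 bytes          */
	size += EXAMPLE_MPTCP_DSS_SIZE;	/* MPTCP has precedence: 20 more */

	remaining = MAX_TCP_OPTION_SPACE - size;	/* 8 bytes left */
	if (remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK) {
		/* The new early return: not even one SACK block fits. */
		printf("size=%u, SACK skipped\n", size);
		return 0;
	}

	num_sack_blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			  TCPOLEN_SACK_PERBLOCK;
	if (num_sack_blocks > eff_sacks)
		num_sack_blocks = eff_sacks;

	size += TCPOLEN_SACK_BASE_ALIGNED +
		num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	printf("size=%u with %u SACK block(s)\n", size, num_sack_blocks);
	return 0;
}
```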
.. | .. |
---|
966 | 1194 | return HRTIMER_NORESTART; |
---|
967 | 1195 | } |
---|
968 | 1196 | |
---|
969 | | -static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) |
---|
| 1197 | +static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, |
---|
| 1198 | + u64 prior_wstamp) |
---|
970 | 1199 | { |
---|
971 | 1200 | struct tcp_sock *tp = tcp_sk(sk); |
---|
972 | | - ktime_t expire, now; |
---|
973 | | - u64 len_ns; |
---|
974 | | - u32 rate; |
---|
975 | 1201 | |
---|
976 | | - if (!tcp_needs_internal_pacing(sk)) |
---|
977 | | - return; |
---|
978 | | - rate = sk->sk_pacing_rate; |
---|
979 | | - if (!rate || rate == ~0U) |
---|
980 | | - return; |
---|
| 1202 | + if (sk->sk_pacing_status != SK_PACING_NONE) { |
---|
| 1203 | + unsigned long rate = sk->sk_pacing_rate; |
---|
981 | 1204 | |
---|
982 | | - len_ns = (u64)skb->len * NSEC_PER_SEC; |
---|
983 | | - do_div(len_ns, rate); |
---|
984 | | - now = ktime_get(); |
---|
985 | | - /* If hrtimer is already armed, then our caller has not |
---|
986 | | - * used tcp_pacing_check(). |
---|
987 | | - */ |
---|
988 | | - if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) { |
---|
989 | | - expire = hrtimer_get_softexpires(&tp->pacing_timer); |
---|
990 | | - if (ktime_after(expire, now)) |
---|
991 | | - now = expire; |
---|
992 | | - if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1) |
---|
993 | | - __sock_put(sk); |
---|
| 1205 | + /* Original sch_fq does not pace first 10 MSS |
---|
| 1206 | + * Note that tp->data_segs_out overflows after 2^32 packets, |
---|
| 1207 | + * this is a minor annoyance. |
---|
| 1208 | + */ |
---|
| 1209 | + if (rate != ~0UL && rate && tp->data_segs_out >= 10) { |
---|
| 1210 | + u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); |
---|
| 1211 | + u64 credit = tp->tcp_wstamp_ns - prior_wstamp; |
---|
| 1212 | + |
---|
| 1213 | + /* take into account OS jitter */ |
---|
| 1214 | + len_ns -= min_t(u64, len_ns / 2, credit); |
---|
| 1215 | + tp->tcp_wstamp_ns += len_ns; |
---|
| 1216 | + } |
---|
994 | 1217 | } |
---|
995 | | - hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns), |
---|
996 | | - HRTIMER_MODE_ABS_PINNED_SOFT); |
---|
997 | | - sock_hold(sk); |
---|
998 | | -} |
---|
999 | | - |
---|
1000 | | -static bool tcp_pacing_check(const struct sock *sk) |
---|
1001 | | -{ |
---|
1002 | | - return tcp_needs_internal_pacing(sk) && |
---|
1003 | | - hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); |
---|
1004 | | -} |
---|
1005 | | - |
---|
1006 | | -static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) |
---|
1007 | | -{ |
---|
1008 | | - skb->skb_mstamp = tp->tcp_mstamp; |
---|
1009 | 1218 | list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); |
---|
1010 | 1219 | } |
---|
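The rewritten helper folds the old hrtimer-based internal pacing into an earliest-departure-time (EDT) update: advance tp->tcp_wstamp_ns by the skb's serialization time at the pacing rate, minus up to half of that as credit for time that already elapsed since the previous departure. A minimal sketch of the arithmetic, with a 64 KB GSO packet at 1 Gbit/s as the assumed example:

```c
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ull

/* Advance the write stamp (EDT) by the skb's serialization time at
 * @rate, refunding at most half of it against time that already
 * elapsed since the previous departure (the "OS jitter" credit).
 */
static uint64_t next_departure(uint64_t wstamp_ns, uint64_t prior_wstamp_ns,
			       uint32_t skb_len, uint64_t rate_bytes_per_sec)
{
	uint64_t len_ns = (uint64_t)skb_len * NSEC_PER_SEC / rate_bytes_per_sec;
	uint64_t credit = wstamp_ns - prior_wstamp_ns;

	len_ns -= credit < len_ns / 2 ? credit : len_ns / 2;
	return wstamp_ns + len_ns;
}

int main(void)
{
	uint64_t rate = 125000000ull;	/* 1 Gbit/s in bytes per second */
	uint64_t wstamp = 10 * NSEC_PER_SEC;

	/* Back-to-back sends: the full ~524 us slot is charged. */
	printf("no credit:   +%llu ns\n", (unsigned long long)
	       (next_departure(wstamp, wstamp, 65536, rate) - wstamp));

	/* 1 ms gap since the previous departure: at most half the slot
	 * (~262 us) is refunded, never the whole thing.
	 */
	printf("with credit: +%llu ns\n", (unsigned long long)
	       (next_departure(wstamp, wstamp - 1000000, 65536, rate) - wstamp));
	return 0;
}
```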
| 1220 | + |
---|
| 1221 | +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); |
---|
| 1222 | +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); |
---|
| 1223 | +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); |
---|
1011 | 1224 | |
---|
1012 | 1225 | /* This routine actually transmits TCP packets queued in by |
---|
1013 | 1226 | * tcp_do_sendmsg(). This is used by both the initial |
---|
.. | .. |
---|
1032 | 1245 | struct sk_buff *oskb = NULL; |
---|
1033 | 1246 | struct tcp_md5sig_key *md5; |
---|
1034 | 1247 | struct tcphdr *th; |
---|
| 1248 | + u64 prior_wstamp; |
---|
1035 | 1249 | int err; |
---|
1036 | 1250 | |
---|
1037 | 1251 | BUG_ON(!skb || !tcp_skb_pcount(skb)); |
---|
1038 | 1252 | tp = tcp_sk(sk); |
---|
1039 | | - |
---|
| 1253 | + prior_wstamp = tp->tcp_wstamp_ns; |
---|
| 1254 | + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); |
---|
| 1255 | + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; |
---|
1040 | 1256 | if (clone_it) { |
---|
1041 | 1257 | TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq |
---|
1042 | 1258 | - tp->snd_una; |
---|
.. | .. |
---|
1051 | 1267 | |
---|
1052 | 1268 | if (unlikely(!skb)) |
---|
1053 | 1269 | return -ENOBUFS; |
---|
| 1270 | + /* retransmit skbs might have a non zero value in skb->dev |
---|
| 1271 | + * because skb->dev is aliased with skb->rbnode.rb_left |
---|
| 1272 | + */ |
---|
| 1273 | + skb->dev = NULL; |
---|
1054 | 1274 | } |
---|
1055 | | - skb->skb_mstamp = tp->tcp_mstamp; |
---|
1056 | 1275 | |
---|
1057 | 1276 | inet = inet_sk(sk); |
---|
1058 | 1277 | tcb = TCP_SKB_CB(skb); |
---|
1059 | 1278 | memset(&opts, 0, sizeof(opts)); |
---|
1060 | 1279 | |
---|
1061 | | - if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) |
---|
| 1280 | + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { |
---|
1062 | 1281 | tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); |
---|
1063 | | - else |
---|
| 1282 | + } else { |
---|
1064 | 1283 | tcp_options_size = tcp_established_options(sk, skb, &opts, |
---|
1065 | 1284 | &md5); |
---|
| 1285 | + /* Force a PSH flag on all (GSO) packets to expedite GRO flush |
---|
| 1286 | + * at the receiver: this slightly improves GRO performance. |
---|
| 1287 | + * Note that we do not force the PSH flag for non-GSO packets, |
---|
| 1288 | + * because they might be sent under high congestion events, |
---|
| 1289 | + * and in this case it is better to delay the delivery of 1-MSS |
---|
| 1290 | + * packets and thus the corresponding ACK packet that would |
---|
| 1291 | + * release the following packet. |
---|
| 1292 | + */ |
---|
| 1293 | + if (tcp_skb_pcount(skb) > 1) |
---|
| 1294 | + tcb->tcp_flags |= TCPHDR_PSH; |
---|
| 1295 | + } |
---|
1066 | 1296 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); |
---|
1067 | 1297 | |
---|
1068 | 1298 | /* if no packet is in qdisc/device queue, then allow XPS to select |
---|
.. | .. |
---|
1135 | 1365 | } |
---|
1136 | 1366 | #endif |
---|
1137 | 1367 | |
---|
1138 | | - icsk->icsk_af_ops->send_check(sk, skb); |
---|
| 1368 | + /* BPF prog is the last one writing header option */ |
---|
| 1369 | + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); |
---|
| 1370 | + |
---|
| 1371 | + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, |
---|
| 1372 | + tcp_v6_send_check, tcp_v4_send_check, |
---|
| 1373 | + sk, skb); |
---|
1139 | 1374 | |
---|
1140 | 1375 | if (likely(tcb->tcp_flags & TCPHDR_ACK)) |
---|
1141 | | - tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); |
---|
| 1376 | + tcp_event_ack_sent(sk, rcv_nxt); |
---|
1142 | 1377 | |
---|
1143 | 1378 | if (skb->len != tcp_header_size) { |
---|
1144 | 1379 | tcp_event_data_sent(tp, sk); |
---|
1145 | 1380 | tp->data_segs_out += tcp_skb_pcount(skb); |
---|
1146 | 1381 | tp->bytes_sent += skb->len - tcp_header_size; |
---|
1147 | | - tcp_internal_pacing(sk, skb); |
---|
1148 | 1382 | } |
---|
1149 | 1383 | |
---|
1150 | 1384 | if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) |
---|
.. | .. |
---|
1156 | 1390 | skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); |
---|
1157 | 1391 | skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); |
---|
1158 | 1392 | |
---|
1159 | | - /* Our usage of tstamp should remain private */ |
---|
1160 | | - skb->tstamp = 0; |
---|
| 1393 | + /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ |
---|
1161 | 1394 | |
---|
1162 | 1395 | /* Cleanup our debris for IP stacks */ |
---|
1163 | 1396 | memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), |
---|
1164 | 1397 | sizeof(struct inet6_skb_parm))); |
---|
1165 | 1398 | |
---|
1166 | | - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); |
---|
| 1399 | + tcp_add_tx_delay(skb, tp); |
---|
| 1400 | + |
---|
| 1401 | + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, |
---|
| 1402 | + inet6_csk_xmit, ip_queue_xmit, |
---|
| 1403 | + sk, skb, &inet->cork.fl); |
---|
1167 | 1404 | |
---|
1168 | 1405 | if (unlikely(err > 0)) { |
---|
1169 | 1406 | tcp_enter_cwr(sk); |
---|
1170 | 1407 | err = net_xmit_eval(err); |
---|
1171 | 1408 | } |
---|
1172 | 1409 | if (!err && oskb) { |
---|
1173 | | - tcp_update_skb_after_send(tp, oskb); |
---|
| 1410 | + tcp_update_skb_after_send(sk, oskb, prior_wstamp); |
---|
1174 | 1411 | tcp_rate_skb_sent(sk, oskb); |
---|
1175 | 1412 | } |
---|
1176 | 1413 | return err; |
---|
.. | .. |
---|
1196 | 1433 | WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); |
---|
1197 | 1434 | __skb_header_release(skb); |
---|
1198 | 1435 | tcp_add_write_queue_tail(sk, skb); |
---|
1199 | | - sk->sk_wmem_queued += skb->truesize; |
---|
| 1436 | + sk_wmem_queued_add(sk, skb->truesize); |
---|
1200 | 1437 | sk_mem_charge(sk, skb->truesize); |
---|
1201 | 1438 | } |
---|
1202 | 1439 | |
---|
.. | .. |
---|
1321 | 1558 | return -ENOMEM; |
---|
1322 | 1559 | } |
---|
1323 | 1560 | |
---|
1324 | | - if (skb_unclone(skb, gfp)) |
---|
| 1561 | + if (skb_unclone_keeptruesize(skb, gfp)) |
---|
1325 | 1562 | return -ENOMEM; |
---|
1326 | 1563 | |
---|
1327 | 1564 | /* Get a new skb... force flag on. */ |
---|
1328 | 1565 | buff = sk_stream_alloc_skb(sk, nsize, gfp, true); |
---|
1329 | 1566 | if (!buff) |
---|
1330 | 1567 | return -ENOMEM; /* We'll just try again later. */ |
---|
| 1568 | + skb_copy_decrypted(buff, skb); |
---|
1331 | 1569 | |
---|
1332 | | - sk->sk_wmem_queued += buff->truesize; |
---|
| 1570 | + sk_wmem_queued_add(sk, buff->truesize); |
---|
1333 | 1571 | sk_mem_charge(sk, buff->truesize); |
---|
1334 | 1572 | nlen = skb->len - len - nsize; |
---|
1335 | 1573 | buff->truesize += nlen; |
---|
.. | .. |
---|
1410 | 1648 | } else { |
---|
1411 | 1649 | shinfo->frags[k] = shinfo->frags[i]; |
---|
1412 | 1650 | if (eat) { |
---|
1413 | | - shinfo->frags[k].page_offset += eat; |
---|
| 1651 | + skb_frag_off_add(&shinfo->frags[k], eat); |
---|
1414 | 1652 | skb_frag_size_sub(&shinfo->frags[k], eat); |
---|
1415 | 1653 | eat = 0; |
---|
1416 | 1654 | } |
---|
.. | .. |
---|
1429 | 1667 | { |
---|
1430 | 1668 | u32 delta_truesize; |
---|
1431 | 1669 | |
---|
1432 | | - if (skb_unclone(skb, GFP_ATOMIC)) |
---|
| 1670 | + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) |
---|
1433 | 1671 | return -ENOMEM; |
---|
1434 | 1672 | |
---|
1435 | 1673 | delta_truesize = __pskb_trim_head(skb, len); |
---|
.. | .. |
---|
1439 | 1677 | |
---|
1440 | 1678 | if (delta_truesize) { |
---|
1441 | 1679 | skb->truesize -= delta_truesize; |
---|
1442 | | - sk->sk_wmem_queued -= delta_truesize; |
---|
| 1680 | + sk_wmem_queued_add(sk, -delta_truesize); |
---|
1443 | 1681 | sk_mem_uncharge(sk, delta_truesize); |
---|
1444 | | - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
---|
1445 | 1682 | } |
---|
1446 | 1683 | |
---|
1447 | 1684 | /* Any change of skb->len requires recalculation of tso factor. */ |
---|
.. | .. |
---|
1479 | 1716 | mss_now -= icsk->icsk_ext_hdr_len; |
---|
1480 | 1717 | |
---|
1481 | 1718 | /* Then reserve room for full set of TCP options and 8 bytes of data */ |
---|
1482 | | - mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); |
---|
| 1719 | + mss_now = max(mss_now, |
---|
| 1720 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); |
---|
1483 | 1721 | return mss_now; |
---|
1484 | 1722 | } |
---|
1485 | 1723 | |
---|
.. | .. |
---|
1522 | 1760 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
1523 | 1761 | struct net *net = sock_net(sk); |
---|
1524 | 1762 | |
---|
1525 | | - icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; |
---|
| 1763 | + icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; |
---|
1526 | 1764 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + |
---|
1527 | 1765 | icsk->icsk_af_ops->net_header_len; |
---|
1528 | | - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); |
---|
| 1766 | + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); |
---|
1529 | 1767 | icsk->icsk_mtup.probe_size = 0; |
---|
1530 | 1768 | if (icsk->icsk_mtup.enabled) |
---|
1531 | 1769 | icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; |
---|
.. | .. |
---|
1637 | 1875 | const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; |
---|
1638 | 1876 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1639 | 1877 | |
---|
1640 | | - /* Track the maximum number of outstanding packets in each |
---|
1641 | | - * window, and remember whether we were cwnd-limited then. |
---|
| 1878 | + /* Track the strongest available signal of the degree to which the cwnd |
---|
| 1879 | + * is fully utilized. If cwnd-limited then remember that fact for the |
---|
| 1880 | + * current window. If not cwnd-limited then track the maximum number of |
---|
| 1881 | + * outstanding packets in the current window. (If cwnd-limited then we |
---|
| 1882 | + * chose to not update tp->max_packets_out to avoid an extra else |
---|
| 1883 | + * clause with no functional impact.) |
---|
1642 | 1884 | */ |
---|
1643 | | - if (!before(tp->snd_una, tp->max_packets_seq) || |
---|
1644 | | - tp->packets_out > tp->max_packets_out || |
---|
1645 | | - is_cwnd_limited) { |
---|
1646 | | - tp->max_packets_out = tp->packets_out; |
---|
1647 | | - tp->max_packets_seq = tp->snd_nxt; |
---|
| 1885 | + if (!before(tp->snd_una, tp->cwnd_usage_seq) || |
---|
| 1886 | + is_cwnd_limited || |
---|
| 1887 | + (!tp->is_cwnd_limited && |
---|
| 1888 | + tp->packets_out > tp->max_packets_out)) { |
---|
1648 | 1889 | tp->is_cwnd_limited = is_cwnd_limited; |
---|
| 1890 | + tp->max_packets_out = tp->packets_out; |
---|
| 1891 | + tp->cwnd_usage_seq = tp->snd_nxt; |
---|
1649 | 1892 | } |
---|
1650 | 1893 | |
---|
1651 | 1894 | if (tcp_is_cwnd_limited(sk)) { |
---|
.. | .. |
---|
1657 | 1900 | if (tp->packets_out > tp->snd_cwnd_used) |
---|
1658 | 1901 | tp->snd_cwnd_used = tp->packets_out; |
---|
1659 | 1902 | |
---|
1660 | | - if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && |
---|
| 1903 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && |
---|
1661 | 1904 | (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && |
---|
1662 | 1905 | !ca_ops->cong_control) |
---|
1663 | 1906 | tcp_cwnd_application_limited(sk); |
---|
.. | .. |
---|
1721 | 1964 | { |
---|
1722 | 1965 | u32 bytes, segs; |
---|
1723 | 1966 | |
---|
1724 | | - bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, |
---|
1725 | | - sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); |
---|
| 1967 | + bytes = min_t(unsigned long, |
---|
| 1968 | + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), |
---|
| 1969 | + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); |
---|
1726 | 1970 | |
---|
1727 | 1971 | /* Goal is to send at least one packet per ms, |
---|
1728 | 1972 | * not one big TSO packet every 100 ms. |
---|
.. | .. |
---|
1744 | 1988 | |
---|
1745 | 1989 | min_tso = ca_ops->min_tso_segs ? |
---|
1746 | 1990 | ca_ops->min_tso_segs(sk) : |
---|
1747 | | - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; |
---|
| 1991 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); |
---|
1748 | 1992 | |
---|
1749 | 1993 | tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); |
---|
1750 | 1994 | return min_t(u32, tso_segs, sk->sk_gso_max_segs); |
---|
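A rough sketch of what the sk_pacing_shift-based sizing above amounts to (the sk_gso_max_size/MAX_TCP_HEADER cap is omitted, and the rates, MSS and min_tso_segs values are assumed): with the default shift of 10, `rate >> 10` is roughly one millisecond worth of payload, which is then expressed in segments and floored at the congestion control's minimum.

```c
#include <stdint.h>
#include <stdio.h>

/* With the default sk_pacing_shift of 10, rate >> 10 is roughly 1 ms
 * worth of payload.  The sk_gso_max_size / MAX_TCP_HEADER cap is
 * intentionally left out of this sketch.
 */
static uint32_t tso_autosize_segs(uint64_t pacing_rate, uint32_t pacing_shift,
				  uint32_t mss, uint32_t min_tso_segs)
{
	uint64_t bytes = pacing_rate >> pacing_shift;
	uint64_t segs = bytes / mss;

	return segs > min_tso_segs ? (uint32_t)segs : min_tso_segs;
}

int main(void)
{
	/* 1 Gbit/s (125 MB/s), default shift 10, MSS 1448: ~84 segments. */
	printf("segs=%u\n", tso_autosize_segs(125000000ull, 10, 1448, 2));

	/* 10 Mbit/s: the 1 ms budget is under one MSS, min_tso_segs wins. */
	printf("segs=%u\n", tso_autosize_segs(1250000ull, 10, 1448, 2));
	return 0;
}
```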
.. | .. |
---|
1868 | 2112 | * know that all the data is in scatter-gather pages, and that the |
---|
1869 | 2113 | * packet has never been sent out before (and thus is not cloned). |
---|
1870 | 2114 | */ |
---|
1871 | | -static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, |
---|
1872 | | - struct sk_buff *skb, unsigned int len, |
---|
| 2115 | +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, |
---|
1873 | 2116 | unsigned int mss_now, gfp_t gfp) |
---|
1874 | 2117 | { |
---|
1875 | | - struct sk_buff *buff; |
---|
1876 | 2118 | int nlen = skb->len - len; |
---|
| 2119 | + struct sk_buff *buff; |
---|
1877 | 2120 | u8 flags; |
---|
1878 | 2121 | |
---|
1879 | 2122 | /* All of a TSO frame must be composed of paged data. */ |
---|
1880 | 2123 | if (skb->len != skb->data_len) |
---|
1881 | | - return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); |
---|
| 2124 | + return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, |
---|
| 2125 | + skb, len, mss_now, gfp); |
---|
1882 | 2126 | |
---|
1883 | 2127 | buff = sk_stream_alloc_skb(sk, 0, gfp, true); |
---|
1884 | 2128 | if (unlikely(!buff)) |
---|
1885 | 2129 | return -ENOMEM; |
---|
| 2130 | + skb_copy_decrypted(buff, skb); |
---|
1886 | 2131 | |
---|
1887 | | - sk->sk_wmem_queued += buff->truesize; |
---|
| 2132 | + sk_wmem_queued_add(sk, buff->truesize); |
---|
1888 | 2133 | sk_mem_charge(sk, buff->truesize); |
---|
1889 | 2134 | buff->truesize += nlen; |
---|
1890 | 2135 | skb->truesize -= nlen; |
---|
.. | .. |
---|
1914 | 2159 | |
---|
1915 | 2160 | /* Link BUFF into the send queue. */ |
---|
1916 | 2161 | __skb_header_release(buff); |
---|
1917 | | - tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); |
---|
| 2162 | + tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); |
---|
1918 | 2163 | |
---|
1919 | 2164 | return 0; |
---|
1920 | 2165 | } |
---|
.. | .. |
---|
1930 | 2175 | u32 max_segs) |
---|
1931 | 2176 | { |
---|
1932 | 2177 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
1933 | | - u32 age, send_win, cong_win, limit, in_flight; |
---|
| 2178 | + u32 send_win, cong_win, limit, in_flight; |
---|
1934 | 2179 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1935 | 2180 | struct sk_buff *head; |
---|
1936 | 2181 | int win_divisor; |
---|
| 2182 | + s64 delta; |
---|
1937 | 2183 | |
---|
1938 | 2184 | if (icsk->icsk_ca_state >= TCP_CA_Recovery) |
---|
1939 | 2185 | goto send_now; |
---|
1940 | 2186 | |
---|
1941 | 2187 | /* Avoid bursty behavior by allowing defer |
---|
1942 | | - * only if the last write was recent. |
---|
| 2188 | + * only if the last write was recent (1 ms). |
---|
| 2189 | + * Note that tp->tcp_wstamp_ns can be in the future if we have |
---|
| 2190 | + * packets waiting in a qdisc or device for EDT delivery. |
---|
1943 | 2191 | */ |
---|
1944 | | - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) |
---|
| 2192 | + delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; |
---|
| 2193 | + if (delta > 0) |
---|
1945 | 2194 | goto send_now; |
---|
1946 | 2195 | |
---|
1947 | 2196 | in_flight = tcp_packets_in_flight(tp); |
---|
.. | .. |
---|
1988 | 2237 | head = tcp_rtx_queue_head(sk); |
---|
1989 | 2238 | if (!head) |
---|
1990 | 2239 | goto send_now; |
---|
1991 | | - age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); |
---|
| 2240 | + delta = tp->tcp_clock_cache - head->tstamp; |
---|
1992 | 2241 | /* If next ACK is likely to come too late (half srtt), do not defer */ |
---|
1993 | | - if (age < (tp->srtt_us >> 4)) |
---|
| 2242 | + if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) |
---|
1994 | 2243 | goto send_now; |
---|
1995 | 2244 | |
---|
1996 | 2245 | /* Ok, it looks like it is advisable to defer. |
---|
.. | .. |
---|
2012 | 2261 | } |
---|
2013 | 2262 | |
---|
2014 | 2263 | /* If this packet won't get more data, do not wait. */ |
---|
2015 | | - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
---|
| 2264 | + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || |
---|
| 2265 | + TCP_SKB_CB(skb)->eor) |
---|
2016 | 2266 | goto send_now; |
---|
2017 | 2267 | |
---|
2018 | 2268 | return true; |
---|
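The two reworked time checks above, restated as a stand-alone sketch in the units the patch uses (nanoseconds throughout, srtt_us stored as usec << 3). The concrete timestamps and the 20 ms srtt in main() are made-up values for illustration.

```c
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ull
#define NSEC_PER_MSEC	1000000ull

/* Returns 1 when deferral should be abandoned and the skb sent now. */
static int should_send_now(uint64_t clock_cache_ns, uint64_t wstamp_ns,
			   uint64_t head_tstamp_ns, uint32_t srtt_us_shifted3)
{
	/* Guard 1: only defer if the last (scheduled) write is less than
	 * 1 ms old.  wstamp can sit in the future when skbs are already
	 * queued for EDT delivery, which keeps the difference negative.
	 */
	int64_t delta = (int64_t)(clock_cache_ns - wstamp_ns - NSEC_PER_MSEC);

	if (delta > 0)
		return 1;

	/* Guard 2: if the oldest unacked skb (rtx queue head) is younger
	 * than srtt/2, the ACK that would open more cwnd is still more
	 * than half an RTT away, so waiting for it is not worth it.
	 */
	int64_t age = (int64_t)(clock_cache_ns - head_tstamp_ns);

	if (age - (int64_t)(NSEC_PER_USEC * (srtt_us_shifted3 >> 4)) < 0)
		return 1;

	return 0;	/* both guards passed, deferring stays an option */
}

int main(void)
{
	uint64_t now = 100 * NSEC_PER_MSEC;
	uint32_t srtt = 20000u << 3;	/* 20 ms smoothed RTT, kernel units */

	/* head sent 15 ms ago (> srtt/2), last write 0.5 ms ago: may defer */
	printf("%d\n", should_send_now(now, now - 500000,
				       now - 15 * NSEC_PER_MSEC, srtt));
	/* head sent 5 ms ago (< srtt/2): send now */
	printf("%d\n", should_send_now(now, now - 500000,
				       now - 5 * NSEC_PER_MSEC, srtt));
	return 0;
}
```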
.. | .. |
---|
2029 | 2279 | u32 interval; |
---|
2030 | 2280 | s32 delta; |
---|
2031 | 2281 | |
---|
2032 | | - interval = net->ipv4.sysctl_tcp_probe_interval; |
---|
| 2282 | + interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); |
---|
2033 | 2283 | delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; |
---|
2034 | 2284 | if (unlikely(delta >= interval * HZ)) { |
---|
2035 | 2285 | int mss = tcp_current_mss(sk); |
---|
.. | .. |
---|
2111 | 2361 | * probing process by not resetting search range to its original. |
---|
2112 | 2362 | */ |
---|
2113 | 2363 | if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || |
---|
2114 | | - interval < net->ipv4.sysctl_tcp_probe_threshold) { |
---|
| 2364 | + interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { |
---|
2115 | 2365 | /* Check whether enough time has elapsed for |
---|
2116 | 2366 | * another round of probing. |
---|
2117 | 2367 | */ |
---|
.. | .. |
---|
2139 | 2389 | if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) |
---|
2140 | 2390 | return -1; |
---|
2141 | 2391 | |
---|
2142 | | - if (tcp_pacing_check(sk)) |
---|
2143 | | - return -1; |
---|
2144 | | - |
---|
2145 | 2392 | /* We're allowed to probe. Build it now. */ |
---|
2146 | 2393 | nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); |
---|
2147 | 2394 | if (!nskb) |
---|
2148 | 2395 | return -1; |
---|
2149 | | - sk->sk_wmem_queued += nskb->truesize; |
---|
| 2396 | + sk_wmem_queued_add(sk, nskb->truesize); |
---|
2150 | 2397 | sk_mem_charge(sk, nskb->truesize); |
---|
2151 | 2398 | |
---|
2152 | 2399 | skb = tcp_send_head(sk); |
---|
| 2400 | + skb_copy_decrypted(nskb, skb); |
---|
2153 | 2401 | |
---|
2154 | 2402 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; |
---|
2155 | 2403 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; |
---|
.. | .. |
---|
2215 | 2463 | return -1; |
---|
2216 | 2464 | } |
---|
2217 | 2465 | |
---|
| 2466 | +static bool tcp_pacing_check(struct sock *sk) |
---|
| 2467 | +{ |
---|
| 2468 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2469 | + |
---|
| 2470 | + if (!tcp_needs_internal_pacing(sk)) |
---|
| 2471 | + return false; |
---|
| 2472 | + |
---|
| 2473 | + if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) |
---|
| 2474 | + return false; |
---|
| 2475 | + |
---|
| 2476 | + if (!hrtimer_is_queued(&tp->pacing_timer)) { |
---|
| 2477 | + hrtimer_start(&tp->pacing_timer, |
---|
| 2478 | + ns_to_ktime(tp->tcp_wstamp_ns), |
---|
| 2479 | + HRTIMER_MODE_ABS_PINNED_SOFT); |
---|
| 2480 | + sock_hold(sk); |
---|
| 2481 | + } |
---|
| 2482 | + return true; |
---|
| 2483 | +} |
---|
| 2484 | + |
---|
2218 | 2485 | /* TCP Small Queues : |
---|
2219 | 2486 | * Control number of packets in qdisc/devices to two packets / or ~1 ms. |
---|
2220 | 2487 | * (These limits are doubled for retransmits) |
---|
.. | .. |
---|
2229 | 2496 | static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, |
---|
2230 | 2497 | unsigned int factor) |
---|
2231 | 2498 | { |
---|
2232 | | - unsigned int limit; |
---|
| 2499 | + unsigned long limit; |
---|
2233 | 2500 | |
---|
2234 | | - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); |
---|
2235 | | - limit = min_t(u32, limit, |
---|
2236 | | - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); |
---|
| 2501 | + limit = max_t(unsigned long, |
---|
| 2502 | + 2 * skb->truesize, |
---|
| 2503 | + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); |
---|
| 2504 | + if (sk->sk_pacing_status == SK_PACING_NONE) |
---|
| 2505 | + limit = min_t(unsigned long, limit, |
---|
| 2506 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); |
---|
2237 | 2507 | limit <<= factor; |
---|
2238 | 2508 | |
---|
| 2509 | + if (static_branch_unlikely(&tcp_tx_delay_enabled) && |
---|
| 2510 | + tcp_sk(sk)->tcp_tx_delay) { |
---|
| 2511 | + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; |
---|
| 2512 | + |
---|
| 2513 | + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we |
---|
| 2514 | + * approximate our needs assuming an ~100% skb->truesize overhead. |
---|
| 2515 | + * USEC_PER_SEC is approximated by 2^20. |
---|
| 2516 | + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. |
---|
| 2517 | + */ |
---|
| 2518 | + extra_bytes >>= (20 - 1); |
---|
| 2519 | + limit += extra_bytes; |
---|
| 2520 | + } |
---|
2239 | 2521 | if (refcount_read(&sk->sk_wmem_alloc) > limit) { |
---|
2240 | 2522 | /* Always send skb if rtx queue is empty. |
---|
2241 | 2523 | * No need to wait for TX completion to call us back, |
---|
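A worked example of the shift approximation in the tcp_tx_delay branch above (the 1 Gbit/s pacing rate and 10 ms delay are assumed numbers): doubling for the ~100% truesize overhead means dividing by USEC_PER_SEC/2, and the single `>> (20 - 1)` stands in for that division at the cost of being about 5% low.

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pacing_rate = 125000000ull;	/* 1 Gbit/s, in bytes/sec */
	uint64_t tx_delay_us = 10000ull;	/* 10 ms of TCP_TX_DELAY  */

	/* Exact budget: bytes serialized during tx_delay, doubled to
	 * cover the assumed ~100% skb->truesize overhead.
	 */
	uint64_t exact = 2 * pacing_rate * tx_delay_us / 1000000;

	/* The patch's shortcut: one multiply and one shift.  2^19 = 524288
	 * stands in for USEC_PER_SEC / 2 = 500000, about 5% low.
	 */
	uint64_t approx = (pacing_rate * tx_delay_us) >> (20 - 1);

	printf("exact=%llu approx=%llu\n",
	       (unsigned long long)exact, (unsigned long long)approx);
	return 0;
}
```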
.. | .. |
---|
2341 | 2623 | while ((skb = tcp_send_head(sk))) { |
---|
2342 | 2624 | unsigned int limit; |
---|
2343 | 2625 | |
---|
| 2626 | + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { |
---|
| 2627 | + /* "skb_mstamp_ns" is used as a starting point for the retransmit timer */ |
---|
| 2628 | + skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; |
---|
| 2629 | + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); |
---|
| 2630 | + tcp_init_tso_segs(skb, mss_now); |
---|
| 2631 | + goto repair; /* Skip network transmission */ |
---|
| 2632 | + } |
---|
| 2633 | + |
---|
2344 | 2634 | if (tcp_pacing_check(sk)) |
---|
2345 | 2635 | break; |
---|
2346 | 2636 | |
---|
2347 | 2637 | tso_segs = tcp_init_tso_segs(skb, mss_now); |
---|
2348 | 2638 | BUG_ON(!tso_segs); |
---|
2349 | | - |
---|
2350 | | - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { |
---|
2351 | | - /* "skb_mstamp" is used as a start point for the retransmit timer */ |
---|
2352 | | - tcp_update_skb_after_send(tp, skb); |
---|
2353 | | - goto repair; /* Skip network transmission */ |
---|
2354 | | - } |
---|
2355 | 2639 | |
---|
2356 | 2640 | cwnd_quota = tcp_cwnd_test(tp, skb); |
---|
2357 | 2641 | if (!cwnd_quota) { |
---|
.. | .. |
---|
2388 | 2672 | nonagle); |
---|
2389 | 2673 | |
---|
2390 | 2674 | if (skb->len > limit && |
---|
2391 | | - unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, |
---|
2392 | | - skb, limit, mss_now, gfp))) |
---|
| 2675 | + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) |
---|
2393 | 2676 | break; |
---|
2394 | 2677 | |
---|
2395 | 2678 | if (tcp_small_queue_check(sk, skb, 0)) |
---|
.. | .. |
---|
2450 | 2733 | /* Don't do any loss probe on a Fast Open connection before 3WHS |
---|
2451 | 2734 | * finishes. |
---|
2452 | 2735 | */ |
---|
2453 | | - if (tp->fastopen_rsk) |
---|
| 2736 | + if (rcu_access_pointer(tp->fastopen_rsk)) |
---|
2454 | 2737 | return false; |
---|
2455 | 2738 | |
---|
2456 | | - early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; |
---|
| 2739 | + early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); |
---|
2457 | 2740 | /* Schedule a loss probe in 2*RTT for SACK capable connections |
---|
2458 | 2741 | * not in loss recovery, that are either limited by cwnd or application. |
---|
2459 | 2742 | */ |
---|
.. | .. |
---|
2484 | 2767 | if (rto_delta_us > 0) |
---|
2485 | 2768 | timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); |
---|
2486 | 2769 | |
---|
2487 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, |
---|
2488 | | - TCP_RTO_MAX); |
---|
| 2770 | + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); |
---|
2489 | 2771 | return true; |
---|
2490 | 2772 | } |
---|
2491 | 2773 | |
---|
.. | .. |
---|
2666 | 2948 | int mss = icsk->icsk_ack.rcv_mss; |
---|
2667 | 2949 | int free_space = tcp_space(sk); |
---|
2668 | 2950 | int allowed_space = tcp_full_space(sk); |
---|
2669 | | - int full_space = min_t(int, tp->window_clamp, allowed_space); |
---|
2670 | | - int window; |
---|
| 2951 | + int full_space, window; |
---|
| 2952 | + |
---|
| 2953 | + if (sk_is_mptcp(sk)) |
---|
| 2954 | + mptcp_space(sk, &free_space, &allowed_space); |
---|
| 2955 | + |
---|
| 2956 | + full_space = min_t(int, tp->window_clamp, allowed_space); |
---|
2671 | 2957 | |
---|
2672 | 2958 | if (unlikely(mss > full_space)) { |
---|
2673 | 2959 | mss = full_space; |
---|
.. | .. |
---|
2815 | 3101 | struct sk_buff *skb = to, *tmp; |
---|
2816 | 3102 | bool first = true; |
---|
2817 | 3103 | |
---|
2818 | | - if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) |
---|
| 3104 | + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) |
---|
2819 | 3105 | return; |
---|
2820 | 3106 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) |
---|
2821 | 3107 | return; |
---|
.. | .. |
---|
2824 | 3110 | if (!tcp_can_collapse(sk, skb)) |
---|
2825 | 3111 | break; |
---|
2826 | 3112 | |
---|
2827 | | - if (!tcp_skb_can_collapse_to(to)) |
---|
| 3113 | + if (!tcp_skb_can_collapse(to, skb)) |
---|
2828 | 3114 | break; |
---|
2829 | 3115 | |
---|
2830 | 3116 | space -= skb->len; |
---|
.. | .. |
---|
2855 | 3141 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2856 | 3142 | unsigned int cur_mss; |
---|
2857 | 3143 | int diff, len, err; |
---|
2858 | | - |
---|
| 3144 | + int avail_wnd; |
---|
2859 | 3145 | |
---|
2860 | 3146 | /* Inconclusive MTU probe */ |
---|
2861 | 3147 | if (icsk->icsk_mtup.probe_size) |
---|
.. | .. |
---|
2885 | 3171 | return -EHOSTUNREACH; /* Routing failure or similar. */ |
---|
2886 | 3172 | |
---|
2887 | 3173 | cur_mss = tcp_current_mss(sk); |
---|
| 3174 | + avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; |
---|
2888 | 3175 | |
---|
2889 | 3176 | /* If receiver has shrunk his window, and skb is out of |
---|
2890 | 3177 | * new window, do not retransmit it. The exception is the |
---|
2891 | 3178 | * case, when window is shrunk to zero. In this case |
---|
2892 | | - * our retransmit serves as a zero window probe. |
---|
| 3179 | + * our retransmit of one segment serves as a zero window probe. |
---|
2893 | 3180 | */ |
---|
2894 | | - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && |
---|
2895 | | - TCP_SKB_CB(skb)->seq != tp->snd_una) |
---|
2896 | | - return -EAGAIN; |
---|
| 3181 | + if (avail_wnd <= 0) { |
---|
| 3182 | + if (TCP_SKB_CB(skb)->seq != tp->snd_una) |
---|
| 3183 | + return -EAGAIN; |
---|
| 3184 | + avail_wnd = cur_mss; |
---|
| 3185 | + } |
---|
2897 | 3186 | |
---|
2898 | 3187 | len = cur_mss * segs; |
---|
| 3188 | + if (len > avail_wnd) { |
---|
| 3189 | + len = rounddown(avail_wnd, cur_mss); |
---|
| 3190 | + if (!len) |
---|
| 3191 | + len = avail_wnd; |
---|
| 3192 | + } |
---|
2899 | 3193 | if (skb->len > len) { |
---|
2900 | 3194 | if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, |
---|
2901 | 3195 | cur_mss, GFP_ATOMIC)) |
---|
2902 | 3196 | return -ENOMEM; /* We'll try again later. */ |
---|
2903 | 3197 | } else { |
---|
2904 | | - if (skb_unclone(skb, GFP_ATOMIC)) |
---|
| 3198 | + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) |
---|
2905 | 3199 | return -ENOMEM; |
---|
2906 | 3200 | |
---|
2907 | 3201 | diff = tcp_skb_pcount(skb); |
---|
.. | .. |
---|
2909 | 3203 | diff -= tcp_skb_pcount(skb); |
---|
2910 | 3204 | if (diff) |
---|
2911 | 3205 | tcp_adjust_pcount(sk, skb, diff); |
---|
2912 | | - if (skb->len < cur_mss) |
---|
2913 | | - tcp_retrans_try_collapse(sk, skb, cur_mss); |
---|
| 3206 | + avail_wnd = min_t(int, avail_wnd, cur_mss); |
---|
| 3207 | + if (skb->len < avail_wnd) |
---|
| 3208 | + tcp_retrans_try_collapse(sk, skb, avail_wnd); |
---|
2914 | 3209 | } |
---|
2915 | 3210 | |
---|
2916 | 3211 | /* RFC3168, section 6.1.1.1. ECN fallback */ |
---|
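A compact sketch of the new length clamping in __tcp_retransmit_skb() (simplified: the real code only turns a non-positive window into a zero-window probe when the skb starts at snd_una and returns -EAGAIN otherwise; the MSS and window values below are assumptions):

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rounddown_u32(uint32_t x, uint32_t y)
{
	return x - (x % y);
}

/* Mirrors the new length clamping: never retransmit past the window,
 * send whole segments when possible, and fall back to the odd-sized
 * remainder (or a one-segment zero-window probe) otherwise.
 */
static uint32_t retrans_len(uint32_t cur_mss, uint32_t segs, int32_t avail_wnd)
{
	uint32_t len;

	if (avail_wnd <= 0)
		avail_wnd = cur_mss;	/* zero-window probe: one segment */

	len = cur_mss * segs;
	if (len > (uint32_t)avail_wnd) {
		len = rounddown_u32((uint32_t)avail_wnd, cur_mss);
		if (!len)
			len = (uint32_t)avail_wnd; /* window below one MSS */
	}
	return len;
}

int main(void)
{
	printf("%u\n", retrans_len(1448, 3, 3000)); /* 2896: two full MSS      */
	printf("%u\n", retrans_len(1448, 3, 1000)); /* 1000: sub-MSS window    */
	printf("%u\n", retrans_len(1448, 3, 0));    /* 1448: zero-window probe */
	return 0;
}
```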
.. | .. |
---|
2935 | 3230 | |
---|
2936 | 3231 | tcp_skb_tsorted_save(skb) { |
---|
2937 | 3232 | nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); |
---|
2938 | | - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : |
---|
2939 | | - -ENOBUFS; |
---|
| 3233 | + if (nskb) { |
---|
| 3234 | + nskb->dev = NULL; |
---|
| 3235 | + err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); |
---|
| 3236 | + } else { |
---|
| 3237 | + err = -ENOBUFS; |
---|
| 3238 | + } |
---|
2940 | 3239 | } tcp_skb_tsorted_restore(skb); |
---|
2941 | 3240 | |
---|
2942 | 3241 | if (!err) { |
---|
2943 | | - tcp_update_skb_after_send(tp, skb); |
---|
| 3242 | + tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); |
---|
2944 | 3243 | tcp_rate_skb_sent(sk, skb); |
---|
2945 | 3244 | } |
---|
2946 | 3245 | } else { |
---|
2947 | 3246 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
---|
2948 | 3247 | } |
---|
2949 | 3248 | |
---|
| 3249 | + /* To avoid taking spuriously low RTT samples based on a timestamp |
---|
| 3250 | + * for a transmit that never happened, always mark EVER_RETRANS |
---|
| 3251 | + */ |
---|
| 3252 | + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; |
---|
| 3253 | + |
---|
2950 | 3254 | if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) |
---|
2951 | 3255 | tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, |
---|
2952 | 3256 | TCP_SKB_CB(skb)->seq, segs, err); |
---|
2953 | 3257 | |
---|
2954 | 3258 | if (likely(!err)) { |
---|
2955 | | - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; |
---|
2956 | 3259 | trace_tcp_retransmit_skb(sk, skb); |
---|
2957 | 3260 | } else if (err != -EBUSY) { |
---|
2958 | 3261 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); |
---|
.. | .. |
---|
2995 | 3298 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
2996 | 3299 | struct sk_buff *skb, *rtx_head, *hole = NULL; |
---|
2997 | 3300 | struct tcp_sock *tp = tcp_sk(sk); |
---|
| 3301 | + bool rearm_timer = false; |
---|
2998 | 3302 | u32 max_segs; |
---|
2999 | 3303 | int mib_idx; |
---|
3000 | 3304 | |
---|
.. | .. |
---|
3017 | 3321 | |
---|
3018 | 3322 | segs = tp->snd_cwnd - tcp_packets_in_flight(tp); |
---|
3019 | 3323 | if (segs <= 0) |
---|
3020 | | - return; |
---|
| 3324 | + break; |
---|
3021 | 3325 | sacked = TCP_SKB_CB(skb)->sacked; |
---|
3022 | 3326 | /* In case tcp_shift_skb_data() has aggregated large skbs, |
---|
3023 | 3327 | * we need to make sure we are not sending too big TSO packets |
---|
.. | .. |
---|
3042 | 3346 | continue; |
---|
3043 | 3347 | |
---|
3044 | 3348 | if (tcp_small_queue_check(sk, skb, 1)) |
---|
3045 | | - return; |
---|
| 3349 | + break; |
---|
3046 | 3350 | |
---|
3047 | 3351 | if (tcp_retransmit_skb(sk, skb, segs)) |
---|
3048 | | - return; |
---|
| 3352 | + break; |
---|
3049 | 3353 | |
---|
3050 | 3354 | NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); |
---|
3051 | 3355 | |
---|
.. | .. |
---|
3054 | 3358 | |
---|
3055 | 3359 | if (skb == rtx_head && |
---|
3056 | 3360 | icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) |
---|
3057 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
---|
3058 | | - inet_csk(sk)->icsk_rto, |
---|
3059 | | - TCP_RTO_MAX); |
---|
| 3361 | + rearm_timer = true; |
---|
| 3362 | + |
---|
3060 | 3363 | } |
---|
| 3364 | + if (rearm_timer) |
---|
| 3365 | + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
---|
| 3366 | + inet_csk(sk)->icsk_rto, |
---|
| 3367 | + TCP_RTO_MAX); |
---|
3061 | 3368 | } |
---|
3062 | 3369 | |
---|
3063 | 3370 | /* We allow to exceed memory limits for FIN packets to expedite |
---|
.. | .. |
---|
3069 | 3376 | */ |
---|
3070 | 3377 | void sk_forced_mem_schedule(struct sock *sk, int size) |
---|
3071 | 3378 | { |
---|
3072 | | - int amt; |
---|
| 3379 | + int delta, amt; |
---|
3073 | 3380 | |
---|
3074 | | - if (size <= sk->sk_forward_alloc) |
---|
| 3381 | + delta = size - sk->sk_forward_alloc; |
---|
| 3382 | + if (delta <= 0) |
---|
3075 | 3383 | return; |
---|
3076 | | - amt = sk_mem_pages(size); |
---|
| 3384 | + amt = sk_mem_pages(delta); |
---|
3077 | 3385 | sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; |
---|
3078 | 3386 | sk_memory_allocated_add(sk, amt); |
---|
3079 | 3387 | |
---|
.. | .. |
---|
3086 | 3394 | */ |
---|
3087 | 3395 | void tcp_send_fin(struct sock *sk) |
---|
3088 | 3396 | { |
---|
3089 | | - struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); |
---|
| 3397 | + struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); |
---|
3090 | 3398 | struct tcp_sock *tp = tcp_sk(sk); |
---|
3091 | 3399 | |
---|
3092 | 3400 | /* Optimization, tack on the FIN if we have one skb in write queue and |
---|
.. | .. |
---|
3094 | 3402 | * Note: in the latter case, FIN packet will be sent after a timeout, |
---|
3095 | 3403 | * as the TCP stack thinks it has already been transmitted. |
---|
3096 | 3404 | */ |
---|
| 3405 | + tskb = tail; |
---|
3097 | 3406 | if (!tskb && tcp_under_memory_pressure(sk)) |
---|
3098 | 3407 | tskb = skb_rb_last(&sk->tcp_rtx_queue); |
---|
3099 | 3408 | |
---|
3100 | 3409 | if (tskb) { |
---|
3101 | | -coalesce: |
---|
3102 | 3410 | TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; |
---|
3103 | 3411 | TCP_SKB_CB(tskb)->end_seq++; |
---|
3104 | 3412 | tp->write_seq++; |
---|
3105 | | - if (tcp_write_queue_empty(sk)) { |
---|
| 3413 | + if (!tail) { |
---|
3106 | 3414 | /* This means tskb was already sent. |
---|
3107 | 3415 | * Pretend we included the FIN on previous transmit. |
---|
3108 | 3416 | * We need to set tp->snd_nxt to the value it would have |
---|
3109 | 3417 | * if FIN had been sent. This is because retransmit path |
---|
3110 | 3418 | * does not change tp->snd_nxt. |
---|
3111 | 3419 | */ |
---|
3112 | | - tp->snd_nxt++; |
---|
| 3420 | + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); |
---|
3113 | 3421 | return; |
---|
3114 | 3422 | } |
---|
3115 | 3423 | } else { |
---|
3116 | 3424 | skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); |
---|
3117 | | - if (unlikely(!skb)) { |
---|
3118 | | - if (tskb) |
---|
3119 | | - goto coalesce; |
---|
| 3425 | + if (unlikely(!skb)) |
---|
3120 | 3426 | return; |
---|
3121 | | - } |
---|
| 3427 | + |
---|
3122 | 3428 | INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); |
---|
3123 | 3429 | skb_reserve(skb, MAX_TCP_HEADER); |
---|
3124 | 3430 | sk_forced_mem_schedule(sk, skb->truesize); |
---|
.. | .. |
---|
3192 | 3498 | tcp_rtx_queue_unlink_and_free(skb, sk); |
---|
3193 | 3499 | __skb_header_release(nskb); |
---|
3194 | 3500 | tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); |
---|
3195 | | - sk->sk_wmem_queued += nskb->truesize; |
---|
| 3501 | + sk_wmem_queued_add(sk, nskb->truesize); |
---|
3196 | 3502 | sk_mem_charge(sk, nskb->truesize); |
---|
3197 | 3503 | skb = nskb; |
---|
3198 | 3504 | } |
---|
.. | .. |
---|
3204 | 3510 | } |
---|
3205 | 3511 | |
---|
3206 | 3512 | /** |
---|
3207 | | - * tcp_make_synack - Prepare a SYN-ACK. |
---|
3208 | | - * sk: listener socket |
---|
3209 | | - * dst: dst entry attached to the SYNACK |
---|
3210 | | - * req: request_sock pointer |
---|
3211 | | - * |
---|
3212 | | - * Allocate one skb and build a SYNACK packet. |
---|
3213 | | - * @dst is consumed : Caller should not use it again. |
---|
| 3513 | + * tcp_make_synack - Allocate one skb and build a SYNACK packet. |
---|
| 3514 | + * @sk: listener socket |
---|
| 3515 | + * @dst: dst entry attached to the SYNACK. It is consumed and caller |
---|
| 3516 | + * should not use it again. |
---|
| 3517 | + * @req: request_sock pointer |
---|
| 3518 | + * @foc: cookie for tcp fast open |
---|
| 3519 | + * @synack_type: Type of synack to prepare |
---|
| 3520 | + * @syn_skb: SYN packet just received. It could be NULL for rtx case. |
---|
3214 | 3521 | */ |
---|
3215 | 3522 | struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, |
---|
3216 | 3523 | struct request_sock *req, |
---|
3217 | 3524 | struct tcp_fastopen_cookie *foc, |
---|
3218 | | - enum tcp_synack_type synack_type) |
---|
| 3525 | + enum tcp_synack_type synack_type, |
---|
| 3526 | + struct sk_buff *syn_skb) |
---|
3219 | 3527 | { |
---|
3220 | 3528 | struct inet_request_sock *ireq = inet_rsk(req); |
---|
3221 | 3529 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
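
With the new @syn_skb parameter, callers hand tcp_make_synack() the SYN that triggered the SYN-ACK, or NULL on retransmit, exactly as the updated kernel-doc states. A hedged sketch of the calling convention; example_send_synack() is a made-up wrapper for illustration, not a function from this patch:

```c
/* Hypothetical wrapper, illustrating only the new argument. */
static struct sk_buff *example_send_synack(const struct sock *sk,
					   struct dst_entry *dst,
					   struct request_sock *req,
					   struct tcp_fastopen_cookie *foc,
					   struct sk_buff *syn_skb)
{
	/* syn_skb is the just-received SYN; pass NULL when retransmitting. */
	return tcp_make_synack(sk, dst, req, foc, TCP_SYNACK_NORMAL, syn_skb);
}
```
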
.. | .. |
---|
3225 | 3533 | int tcp_header_size; |
---|
3226 | 3534 | struct tcphdr *th; |
---|
3227 | 3535 | int mss; |
---|
| 3536 | + u64 now; |
---|
3228 | 3537 | |
---|
3229 | 3538 | skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
---|
3230 | 3539 | if (unlikely(!skb)) { |
---|
.. | .. |
---|
3256 | 3565 | mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); |
---|
3257 | 3566 | |
---|
3258 | 3567 | memset(&opts, 0, sizeof(opts)); |
---|
| 3568 | + now = tcp_clock_ns(); |
---|
3259 | 3569 | #ifdef CONFIG_SYN_COOKIES |
---|
3260 | | - if (unlikely(req->cookie_ts)) |
---|
3261 | | - skb->skb_mstamp = cookie_init_timestamp(req); |
---|
| 3570 | + if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) |
---|
| 3571 | + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); |
---|
3262 | 3572 | else |
---|
3263 | 3573 | #endif |
---|
3264 | | - skb->skb_mstamp = tcp_clock_us(); |
---|
| 3574 | + { |
---|
| 3575 | + skb->skb_mstamp_ns = now; |
---|
| 3576 | + if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ |
---|
| 3577 | + tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); |
---|
| 3578 | + } |
---|
3265 | 3579 | |
---|
3266 | 3580 | #ifdef CONFIG_TCP_MD5SIG |
---|
3267 | 3581 | rcu_read_lock(); |
---|
3268 | 3582 | md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); |
---|
3269 | 3583 | #endif |
---|
3270 | 3584 | skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); |
---|
| 3585 | + /* bpf program will be interested in the tcp_flags */ |
---|
| 3586 | + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; |
---|
3271 | 3587 | tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, |
---|
3272 | | - foc, synack_type) + sizeof(*th); |
---|
| 3588 | + foc, synack_type, |
---|
| 3589 | + syn_skb) + sizeof(*th); |
---|
3273 | 3590 | |
---|
3274 | 3591 | skb_push(skb, tcp_header_size); |
---|
3275 | 3592 | skb_reset_transport_header(skb); |
---|
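
The cookie branch now keys off `synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok` instead of a dedicated `req->cookie_ts` flag, and the non-cookie branch records the first SYN-ACK time for RTT sampling. The reason the cookie path overrides the skb timestamp at all is that, with SYN cookies, the TCP timestamp value itself carries the negotiated options back to the server through the client's echo. A self-contained model of that idea (the bit layout below is illustrative, not the kernel's exact encoding):

```c
#include <stdint.h>
#include <stdio.h>

#define TS_OPT_BITS 6u	/* low bits reserved for option state (assumed width) */

/* Fold option flags into the low bits of a millisecond timestamp. */
static uint32_t encode_cookie_tsval(uint32_t now_ms, uint32_t opt_bits)
{
	uint32_t mask = (1u << TS_OPT_BITS) - 1;

	return (now_ms & ~mask) | (opt_bits & mask);
}

int main(void)
{
	uint32_t tsval = encode_cookie_tsval(987654321u, 0x15);

	/* The server later recovers the options from the echoed timestamp. */
	printf("tsval=%u, recovered options=0x%02x\n", tsval, tsval & 0x3f);
	return 0;
}
```
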
.. | .. |
---|
3291 | 3608 | th->window = htons(min(req->rsk_rcv_wnd, 65535U)); |
---|
3292 | 3609 | tcp_options_write((__be32 *)(th + 1), NULL, &opts); |
---|
3293 | 3610 | th->doff = (tcp_header_size >> 2); |
---|
3294 | | - __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); |
---|
| 3611 | + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); |
---|
3295 | 3612 | |
---|
3296 | 3613 | #ifdef CONFIG_TCP_MD5SIG |
---|
3297 | 3614 | /* Okay, we have all we need - do the md5 hash if needed */ |
---|
.. | .. |
---|
3301 | 3618 | rcu_read_unlock(); |
---|
3302 | 3619 | #endif |
---|
3303 | 3620 | |
---|
3304 | | - /* Do not fool tcpdump (if any), clean our debris */ |
---|
3305 | | - skb->tstamp = 0; |
---|
| 3621 | + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, |
---|
| 3622 | + synack_type, &opts); |
---|
| 3623 | + |
---|
| 3624 | + skb->skb_mstamp_ns = now; |
---|
| 3625 | + tcp_add_tx_delay(skb, tp); |
---|
| 3626 | + |
---|
3306 | 3627 | return skb; |
---|
3307 | 3628 | } |
---|
3308 | 3629 | EXPORT_SYMBOL(tcp_make_synack); |
---|
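
Instead of zeroing skb->tstamp ("do not fool tcpdump"), the finished SYN-ACK now gets a fresh skb_mstamp_ns right before being handed back, and an optional artificial transmit delay is applied. The sketch below is a reconstruction from memory of what tcp_add_tx_delay() is assumed to do for the TCP_TX_DELAY socket option (push the departure time into the future so a pacing qdisc releases the skb later); treat it as an assumption, not a quote:

```c
/* Reconstructed from memory; the real helper is guarded by a static key. */
static void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp)
{
	if (tp->tcp_tx_delay)	/* delay configured in usec via TCP_TX_DELAY */
		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
}
```
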
.. | .. |
---|
3318 | 3639 | |
---|
3319 | 3640 | rcu_read_lock(); |
---|
3320 | 3641 | ca = tcp_ca_find_key(ca_key); |
---|
3321 | | - if (likely(ca && try_module_get(ca->owner))) { |
---|
3322 | | - module_put(icsk->icsk_ca_ops->owner); |
---|
| 3642 | + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { |
---|
| 3643 | + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); |
---|
3323 | 3644 | icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); |
---|
3324 | 3645 | icsk->icsk_ca_ops = ca; |
---|
3325 | 3646 | } |
---|
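
try_module_get()/module_put() on the congestion-control ops become bpf_try_module_get()/bpf_module_put(), because a congestion-control algorithm may now be a BPF struct_ops object with no backing module. The wrappers are assumed (reconstruction from memory of include/linux/bpf.h, not a quote) to dispatch on a special pseudo-owner:

```c
/* Reconstructed from memory: route refcounting to BPF for struct_ops CCs. */
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
	if (owner == BPF_MODULE_OWNER)
		return bpf_struct_ops_get(data);
	return try_module_get(owner);
}

static inline void bpf_module_put(const void *data, struct module *owner)
{
	if (owner == BPF_MODULE_OWNER)
		bpf_struct_ops_put(data);
	else
		module_put(owner);
}
```
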
.. | .. |
---|
3338 | 3659 | * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. |
---|
3339 | 3660 | */ |
---|
3340 | 3661 | tp->tcp_header_len = sizeof(struct tcphdr); |
---|
3341 | | - if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) |
---|
| 3662 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) |
---|
3342 | 3663 | tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; |
---|
3343 | 3664 | |
---|
3344 | 3665 | #ifdef CONFIG_TCP_MD5SIG |
---|
.. | .. |
---|
3374 | 3695 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
---|
3375 | 3696 | &tp->rcv_wnd, |
---|
3376 | 3697 | &tp->window_clamp, |
---|
3377 | | - sock_net(sk)->ipv4.sysctl_tcp_window_scaling, |
---|
| 3698 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), |
---|
3378 | 3699 | &rcv_wscale, |
---|
3379 | 3700 | rcv_wnd); |
---|
3380 | 3701 | |
---|
.. | .. |
---|
3389 | 3710 | tp->snd_una = tp->write_seq; |
---|
3390 | 3711 | tp->snd_sml = tp->write_seq; |
---|
3391 | 3712 | tp->snd_up = tp->write_seq; |
---|
3392 | | - tp->snd_nxt = tp->write_seq; |
---|
| 3713 | + WRITE_ONCE(tp->snd_nxt, tp->write_seq); |
---|
3393 | 3714 | |
---|
3394 | 3715 | if (likely(!tp->repair)) |
---|
3395 | 3716 | tp->rcv_nxt = 0; |
---|
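
The READ_ONCE() on the sysctls and the WRITE_ONCE() on tp->snd_nxt in this function are data-race annotations rather than behavioural changes: sysctls can be rewritten from /proc at any moment, and fields such as snd_nxt are inspected by paths that do not own the socket lock, so the accesses are marked to prevent compiler tearing. A minimal sketch of the reader side such writers pair with (hypothetical helper name, shown only to illustrate the pairing):

```c
/* Hypothetical lockless reader: pairs with the WRITE_ONCE() writers above. */
static u32 example_peek_snd_nxt(const struct tcp_sock *tp)
{
	return READ_ONCE(tp->snd_nxt);
}
```
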
.. | .. |
---|
3410 | 3731 | |
---|
3411 | 3732 | tcb->end_seq += skb->len; |
---|
3412 | 3733 | __skb_header_release(skb); |
---|
3413 | | - sk->sk_wmem_queued += skb->truesize; |
---|
| 3734 | + sk_wmem_queued_add(sk, skb->truesize); |
---|
3414 | 3735 | sk_mem_charge(sk, skb->truesize); |
---|
3415 | 3736 | WRITE_ONCE(tp->write_seq, tcb->end_seq); |
---|
3416 | 3737 | tp->packets_out += tcp_skb_pcount(skb); |
---|
.. | .. |
---|
3425 | 3746 | */ |
---|
3426 | 3747 | static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) |
---|
3427 | 3748 | { |
---|
| 3749 | + struct inet_connection_sock *icsk = inet_csk(sk); |
---|
3428 | 3750 | struct tcp_sock *tp = tcp_sk(sk); |
---|
3429 | 3751 | struct tcp_fastopen_request *fo = tp->fastopen_req; |
---|
3430 | 3752 | int space, err = 0; |
---|
.. | .. |
---|
3439 | 3761 | * private TCP options. The cost is reduced data space in SYN :( |
---|
3440 | 3762 | */ |
---|
3441 | 3763 | tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); |
---|
| 3764 | + /* Sync mss_cache after updating the mss_clamp */ |
---|
| 3765 | + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
---|
3442 | 3766 | |
---|
3443 | | - space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - |
---|
| 3767 | + space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - |
---|
3444 | 3768 | MAX_TCP_OPTION_SPACE; |
---|
3445 | 3769 | |
---|
3446 | 3770 | space = min_t(size_t, space, fo->size); |
---|
.. | .. |
---|
3465 | 3789 | skb_trim(syn_data, copied); |
---|
3466 | 3790 | space = copied; |
---|
3467 | 3791 | } |
---|
| 3792 | + skb_zcopy_set(syn_data, fo->uarg, NULL); |
---|
3468 | 3793 | } |
---|
3469 | 3794 | /* No more data pending in inet_wait_for_connect() */ |
---|
3470 | 3795 | if (space == fo->size) |
---|
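
Two independent tweaks in tcp_send_syn_data(): the MSS cache is re-synced after mss_clamp may have changed, and the SYN-data skb is attached to the caller's zerocopy notification context via skb_zcopy_set(). The latter presumably exists so MSG_ZEROCOPY can be combined with TCP Fast Open; the userspace sketch below is an illustration of that combination, not code from the patch (the #ifndef fallbacks carry the UAPI constant values believed to be correct):

```c
#include <errno.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif
#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000
#endif

/* Send data on the SYN (TFO) with zerocopy pinning of the payload pages. */
static int send_tfo_zerocopy(int fd, const struct sockaddr_in *dst,
			     const void *buf, size_t len)
{
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
		return -errno;
	if (sendto(fd, buf, len, MSG_FASTOPEN | MSG_ZEROCOPY,
		   (const struct sockaddr *)dst, sizeof(*dst)) < 0)
		return -errno;
	return 0;	/* zerocopy completion arrives later on the error queue */
}
```
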
.. | .. |
---|
3477 | 3802 | |
---|
3478 | 3803 | err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); |
---|
3479 | 3804 | |
---|
3480 | | - syn->skb_mstamp = syn_data->skb_mstamp; |
---|
| 3805 | + syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; |
---|
3481 | 3806 | |
---|
3482 | 3807 | /* Now full SYN+DATA was cloned and sent (or not), |
---|
3483 | 3808 | * remove the SYN from the original skb (syn_data) |
---|
.. | .. |
---|
3548 | 3873 | /* We change tp->snd_nxt after the tcp_transmit_skb() call |
---|
3549 | 3874 | * in order to make this packet get counted in tcpOutSegs. |
---|
3550 | 3875 | */ |
---|
3551 | | - tp->snd_nxt = tp->write_seq; |
---|
| 3876 | + WRITE_ONCE(tp->snd_nxt, tp->write_seq); |
---|
3552 | 3877 | tp->pushed_seq = tp->write_seq; |
---|
3553 | 3878 | buff = tcp_send_head(sk); |
---|
3554 | 3879 | if (unlikely(buff)) { |
---|
3555 | | - tp->snd_nxt = TCP_SKB_CB(buff)->seq; |
---|
| 3880 | + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); |
---|
3556 | 3881 | tp->pushed_seq = TCP_SKB_CB(buff)->seq; |
---|
3557 | 3882 | } |
---|
3558 | 3883 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); |
---|
.. | .. |
---|
3578 | 3903 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
3579 | 3904 | int max_ato = HZ / 2; |
---|
3580 | 3905 | |
---|
3581 | | - if (icsk->icsk_ack.pingpong || |
---|
| 3906 | + if (inet_csk_in_pingpong_mode(sk) || |
---|
3582 | 3907 | (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) |
---|
3583 | 3908 | max_ato = TCP_DELACK_MAX; |
---|
3584 | 3909 | |
---|
.. | .. |
---|
3599 | 3924 | ato = min(ato, max_ato); |
---|
3600 | 3925 | } |
---|
3601 | 3926 | |
---|
| 3927 | + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); |
---|
| 3928 | + |
---|
3602 | 3929 | /* Stay within the limit we were given */ |
---|
3603 | 3930 | timeout = jiffies + ato; |
---|
3604 | 3931 | |
---|
3605 | 3932 | /* Use new timeout only if there wasn't a older one earlier. */ |
---|
3606 | 3933 | if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { |
---|
3607 | | - /* If delack timer was blocked or is about to expire, |
---|
3608 | | - * send ACK now. |
---|
3609 | | - */ |
---|
3610 | | - if (icsk->icsk_ack.blocked || |
---|
3611 | | - time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { |
---|
| 3934 | + /* If delack timer is about to expire, send ACK now. */ |
---|
| 3935 | + if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { |
---|
3612 | 3936 | tcp_send_ack(sk); |
---|
3613 | 3937 | return; |
---|
3614 | 3938 | } |
---|
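
Three related changes in tcp_send_delayed_ack(): the raw pingpong flag test becomes the inet_csk_in_pingpong_mode() accessor, the computed ATO is additionally clamped to icsk_delack_max (assumed to default to TCP_DELACK_MAX and to be lowerable per socket, e.g. from BPF sockops), and the icsk_ack.blocked shortcut in the "send ACK now" check is dropped. The accessor is, to the best of recollection (reconstruction, not a quote), just a threshold test:

```c
/* Reconstructed from memory (include/net/inet_connection_sock.h). */
static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}
```
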
.. | .. |
---|
3637 | 3961 | buff = alloc_skb(MAX_TCP_HEADER, |
---|
3638 | 3962 | sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); |
---|
3639 | 3963 | if (unlikely(!buff)) { |
---|
| 3964 | + struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 3965 | + unsigned long delay; |
---|
| 3966 | + |
---|
| 3967 | + delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; |
---|
| 3968 | + if (delay < TCP_RTO_MAX) |
---|
| 3969 | + icsk->icsk_ack.retry++; |
---|
3640 | 3970 | inet_csk_schedule_ack(sk); |
---|
3641 | | - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; |
---|
3642 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
---|
3643 | | - TCP_DELACK_MAX, TCP_RTO_MAX); |
---|
| 3971 | + icsk->icsk_ack.ato = TCP_ATO_MIN; |
---|
| 3972 | + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); |
---|
3644 | 3973 | return; |
---|
3645 | 3974 | } |
---|
3646 | 3975 | |
---|
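
When the ACK skb cannot be allocated, the old code always re-armed the delayed-ACK timer at TCP_DELACK_MAX; the new code backs off exponentially via icsk_ack.retry, with the timer API capping the value at TCP_RTO_MAX. A self-contained model of the resulting delay sequence across consecutive failures (HZ assumed to be 1000 so jiffies read as milliseconds):

```c
#include <stdio.h>

#define HZ		1000
#define TCP_DELACK_MAX	(HZ / 5)	/* 200 ms */
#define TCP_RTO_MAX	(120 * HZ)	/* 120 s */

int main(void)
{
	unsigned int retry = 0;

	for (int attempt = 1; attempt <= 12; attempt++) {
		unsigned long delay = (unsigned long)TCP_DELACK_MAX << retry;

		if (delay < TCP_RTO_MAX)
			retry++;		/* mirrors icsk_ack.retry++ */
		if (delay > TCP_RTO_MAX)
			delay = TCP_RTO_MAX;	/* timer API clamps to max */
		printf("failed alloc #%2d: re-arm delack in %lu ms\n",
		       attempt, delay);
		/* 200, 400, 800, ... capped at 120000 ms */
	}
	return 0;
}
```
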
.. | .. |
---|
3759 | 4088 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
3760 | 4089 | struct tcp_sock *tp = tcp_sk(sk); |
---|
3761 | 4090 | struct net *net = sock_net(sk); |
---|
3762 | | - unsigned long probe_max; |
---|
| 4091 | + unsigned long timeout; |
---|
3763 | 4092 | int err; |
---|
3764 | 4093 | |
---|
3765 | 4094 | err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); |
---|
.. | .. |
---|
3768 | 4097 | /* Cancel probe timer, if it is not required. */ |
---|
3769 | 4098 | icsk->icsk_probes_out = 0; |
---|
3770 | 4099 | icsk->icsk_backoff = 0; |
---|
| 4100 | + icsk->icsk_probes_tstamp = 0; |
---|
3771 | 4101 | return; |
---|
3772 | 4102 | } |
---|
3773 | 4103 | |
---|
| 4104 | + icsk->icsk_probes_out++; |
---|
3774 | 4105 | if (err <= 0) { |
---|
3775 | | - if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) |
---|
| 4106 | + if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) |
---|
3776 | 4107 | icsk->icsk_backoff++; |
---|
3777 | | - icsk->icsk_probes_out++; |
---|
3778 | | - probe_max = TCP_RTO_MAX; |
---|
| 4108 | + timeout = tcp_probe0_when(sk, TCP_RTO_MAX); |
---|
3779 | 4109 | } else { |
---|
3780 | 4110 | /* If packet was not sent due to local congestion, |
---|
3781 | | - * do not backoff and do not remember icsk_probes_out. |
---|
3782 | | - * Let local senders to fight for local resources. |
---|
3783 | | - * |
---|
3784 | | - * Use accumulated backoff yet. |
---|
| 4111 | + * Let senders fight for local resources conservatively. |
---|
3785 | 4112 | */ |
---|
3786 | | - if (!icsk->icsk_probes_out) |
---|
3787 | | - icsk->icsk_probes_out = 1; |
---|
3788 | | - probe_max = TCP_RESOURCE_PROBE_INTERVAL; |
---|
| 4113 | + timeout = TCP_RESOURCE_PROBE_INTERVAL; |
---|
3789 | 4114 | } |
---|
3790 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
---|
3791 | | - tcp_probe0_when(sk, probe_max), |
---|
3792 | | - TCP_RTO_MAX); |
---|
| 4115 | + |
---|
| 4116 | + timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); |
---|
| 4117 | + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); |
---|
3793 | 4118 | } |
---|
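
tcp_send_probe0() now always bumps icsk_probes_out, clears/uses icsk_probes_tstamp, and derives the timeout via tcp_probe0_when() before clamping it against any TCP_USER_TIMEOUT the application configured. For reference, a reconstruction from memory of the include/net/tcp.h helper (an assumption, not a quote from this patch): the probe interval is an RTO-based exponential backoff with a cap.

```c
/* Reconstructed from memory: exponential zero-window probe interval. */
static inline unsigned long tcp_probe0_base(const struct sock *sk)
{
	return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
}

static inline unsigned long tcp_probe0_when(const struct sock *sk,
					    unsigned long max_when)
{
	u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;

	return (unsigned long)min_t(u64, when, max_when);
}
```

tcp_clamp_probe0_to_user_timeout() then presumably shortens this further so the total probing time cannot outlive the user timeout.
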
3794 | 4119 | |
---|
3795 | 4120 | int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) |
---|
.. | .. |
---|
3799 | 4124 | int res; |
---|
3800 | 4125 | |
---|
3801 | 4126 | tcp_rsk(req)->txhash = net_tx_rndhash(); |
---|
3802 | | - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); |
---|
| 4127 | + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, |
---|
| 4128 | + NULL); |
---|
3803 | 4129 | if (!res) { |
---|
3804 | | - __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); |
---|
3805 | | - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); |
---|
| 4130 | + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); |
---|
| 4131 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); |
---|
3806 | 4132 | if (unlikely(tcp_passive_fastopen(sk))) |
---|
3807 | 4133 | tcp_sk(sk)->total_retrans++; |
---|
3808 | 4134 | trace_tcp_retransmit_synack(sk, req); |
---|