| .. | .. |
|---|
| 40 | 40 | |
|---|
| 41 | 41 | struct tcp_metrics_block { |
|---|
| 42 | 42 | struct tcp_metrics_block __rcu *tcpm_next; |
|---|
| 43 | | - possible_net_t tcpm_net; |
|---|
| 43 | + struct net *tcpm_net; |
|---|
| 44 | 44 | struct inetpeer_addr tcpm_saddr; |
|---|
| 45 | 45 | struct inetpeer_addr tcpm_daddr; |
|---|
| 46 | 46 | unsigned long tcpm_stamp; |
|---|
| .. | .. |
|---|
| 51 | 51 | struct rcu_head rcu_head; |
|---|
| 52 | 52 | }; |
|---|
| 53 | 53 | |
|---|
| 54 | | -static inline struct net *tm_net(struct tcp_metrics_block *tm) |
|---|
| 54 | +static inline struct net *tm_net(const struct tcp_metrics_block *tm) |
|---|
| 55 | 55 | { |
|---|
| 56 | | - return read_pnet(&tm->tcpm_net); |
|---|
| 56 | + /* Paired with the WRITE_ONCE() in tcpm_new() */ |
|---|
| 57 | + return READ_ONCE(tm->tcpm_net); |
|---|
| 57 | 58 | } |
|---|
| 58 | 59 | |
|---|
| 59 | 60 | static bool tcp_metric_locked(struct tcp_metrics_block *tm, |
|---|
| 60 | 61 | enum tcp_metric_index idx) |
|---|
| 61 | 62 | { |
|---|
| 62 | | - return tm->tcpm_lock & (1 << idx); |
|---|
| 63 | + /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ |
|---|
| 64 | + return READ_ONCE(tm->tcpm_lock) & (1 << idx); |
|---|
| 63 | 65 | } |
|---|
| 64 | 66 | |
|---|
| 65 | | -static u32 tcp_metric_get(struct tcp_metrics_block *tm, |
|---|
| 67 | +static u32 tcp_metric_get(const struct tcp_metrics_block *tm, |
|---|
| 66 | 68 | enum tcp_metric_index idx) |
|---|
| 67 | 69 | { |
|---|
| 68 | | - return tm->tcpm_vals[idx]; |
|---|
| 70 | + /* Paired with WRITE_ONCE() in tcp_metric_set() */ |
|---|
| 71 | + return READ_ONCE(tm->tcpm_vals[idx]); |
|---|
| 69 | 72 | } |
|---|
| 70 | 73 | |
|---|
| 71 | 74 | static void tcp_metric_set(struct tcp_metrics_block *tm, |
|---|
| 72 | 75 | enum tcp_metric_index idx, |
|---|
| 73 | 76 | u32 val) |
|---|
| 74 | 77 | { |
|---|
| 75 | | - tm->tcpm_vals[idx] = val; |
|---|
| 78 | + /* Paired with READ_ONCE() in tcp_metric_get() */ |
|---|
| 79 | + WRITE_ONCE(tm->tcpm_vals[idx], val); |
|---|
| 76 | 80 | } |
|---|
| 77 | 81 | |
|---|
| 78 | 82 | static bool addr_same(const struct inetpeer_addr *a, |
|---|
| 79 | 83 | const struct inetpeer_addr *b) |
|---|
| 80 | 84 | { |
|---|
| 81 | | - return inetpeer_addr_cmp(a, b) == 0; |
|---|
| 85 | + return (a->family == b->family) && !inetpeer_addr_cmp(a, b); |
|---|
| 82 | 86 | } |
|---|
| 83 | 87 | |
|---|
| 84 | 88 | struct tcpm_hash_bucket { |
|---|
| .. | .. |
|---|
| 89 | 93 | static unsigned int tcp_metrics_hash_log __read_mostly; |
|---|
| 90 | 94 | |
|---|
| 91 | 95 | static DEFINE_SPINLOCK(tcp_metrics_lock); |
|---|
| 96 | +static DEFINE_SEQLOCK(fastopen_seqlock); |
|---|
| 92 | 97 | |
|---|
| 93 | 98 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, |
|---|
| 94 | 99 | const struct dst_entry *dst, |
|---|
| .. | .. |
|---|
| 97 | 102 | u32 msval; |
|---|
| 98 | 103 | u32 val; |
|---|
| 99 | 104 | |
|---|
| 100 | | - tm->tcpm_stamp = jiffies; |
|---|
| 105 | + WRITE_ONCE(tm->tcpm_stamp, jiffies); |
|---|
| 101 | 106 | |
|---|
| 102 | 107 | val = 0; |
|---|
| 103 | 108 | if (dst_metric_locked(dst, RTAX_RTT)) |
|---|
| .. | .. |
|---|
| 110 | 115 | val |= 1 << TCP_METRIC_CWND; |
|---|
| 111 | 116 | if (dst_metric_locked(dst, RTAX_REORDERING)) |
|---|
| 112 | 117 | val |= 1 << TCP_METRIC_REORDERING; |
|---|
| 113 | | - tm->tcpm_lock = val; |
|---|
| 118 | + /* Paired with READ_ONCE() in tcp_metric_locked() */ |
|---|
| 119 | + WRITE_ONCE(tm->tcpm_lock, val); |
|---|
| 114 | 120 | |
|---|
| 115 | 121 | msval = dst_metric_raw(dst, RTAX_RTT); |
|---|
| 116 | | - tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; |
|---|
| 122 | + tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); |
|---|
| 117 | 123 | |
|---|
| 118 | 124 | msval = dst_metric_raw(dst, RTAX_RTTVAR); |
|---|
| 119 | | - tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; |
|---|
| 120 | | - tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); |
|---|
| 121 | | - tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); |
|---|
| 122 | | - tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); |
|---|
| 125 | + tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); |
|---|
| 126 | + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, |
|---|
| 127 | + dst_metric_raw(dst, RTAX_SSTHRESH)); |
|---|
| 128 | + tcp_metric_set(tm, TCP_METRIC_CWND, |
|---|
| 129 | + dst_metric_raw(dst, RTAX_CWND)); |
|---|
| 130 | + tcp_metric_set(tm, TCP_METRIC_REORDERING, |
|---|
| 131 | + dst_metric_raw(dst, RTAX_REORDERING)); |
|---|
| 123 | 132 | if (fastopen_clear) { |
|---|
| 133 | + write_seqlock(&fastopen_seqlock); |
|---|
| 124 | 134 | tm->tcpm_fastopen.mss = 0; |
|---|
| 125 | 135 | tm->tcpm_fastopen.syn_loss = 0; |
|---|
| 126 | 136 | tm->tcpm_fastopen.try_exp = 0; |
|---|
| 127 | 137 | tm->tcpm_fastopen.cookie.exp = false; |
|---|
| 128 | 138 | tm->tcpm_fastopen.cookie.len = 0; |
|---|
| 139 | + write_sequnlock(&fastopen_seqlock); |
|---|
| 129 | 140 | } |
|---|
| 130 | 141 | } |
|---|
| 131 | 142 | |
|---|
| 132 | 143 | #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) |
|---|
| 133 | 144 | |
|---|
| 134 | | -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) |
|---|
| 145 | +static void tcpm_check_stamp(struct tcp_metrics_block *tm, |
|---|
| 146 | + const struct dst_entry *dst) |
|---|
| 135 | 147 | { |
|---|
| 136 | | - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) |
|---|
| 148 | + unsigned long limit; |
|---|
| 149 | + |
|---|
| 150 | + if (!tm) |
|---|
| 151 | + return; |
|---|
| 152 | + limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; |
|---|
| 153 | + if (unlikely(time_after(jiffies, limit))) |
|---|
| 137 | 154 | tcpm_suck_dst(tm, dst, false); |
|---|
| 138 | 155 | } |
|---|
| 139 | 156 | |
|---|
| .. | .. |
|---|
| 174 | 191 | oldest = deref_locked(tcp_metrics_hash[hash].chain); |
|---|
| 175 | 192 | for (tm = deref_locked(oldest->tcpm_next); tm; |
|---|
| 176 | 193 | tm = deref_locked(tm->tcpm_next)) { |
|---|
| 177 | | - if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) |
|---|
| 194 | + if (time_before(READ_ONCE(tm->tcpm_stamp), |
|---|
| 195 | + READ_ONCE(oldest->tcpm_stamp))) |
|---|
| 178 | 196 | oldest = tm; |
|---|
| 179 | 197 | } |
|---|
| 180 | 198 | tm = oldest; |
|---|
| 181 | 199 | } else { |
|---|
| 182 | | - tm = kmalloc(sizeof(*tm), GFP_ATOMIC); |
|---|
| 200 | + tm = kzalloc(sizeof(*tm), GFP_ATOMIC); |
|---|
| 183 | 201 | if (!tm) |
|---|
| 184 | 202 | goto out_unlock; |
|---|
| 185 | 203 | } |
|---|
| 186 | | - write_pnet(&tm->tcpm_net, net); |
|---|
| 204 | + /* Paired with the READ_ONCE() in tm_net() */ |
|---|
| 205 | + WRITE_ONCE(tm->tcpm_net, net); |
|---|
| 206 | + |
|---|
| 187 | 207 | tm->tcpm_saddr = *saddr; |
|---|
| 188 | 208 | tm->tcpm_daddr = *daddr; |
|---|
| 189 | 209 | |
|---|
| 190 | | - tcpm_suck_dst(tm, dst, true); |
|---|
| 210 | + tcpm_suck_dst(tm, dst, reclaim); |
|---|
| 191 | 211 | |
|---|
| 192 | 212 | if (likely(!reclaim)) { |
|---|
| 193 | 213 | tm->tcpm_next = tcp_metrics_hash[hash].chain; |
|---|
| .. | .. |
|---|
| 329 | 349 | int m; |
|---|
| 330 | 350 | |
|---|
| 331 | 351 | sk_dst_confirm(sk); |
|---|
| 332 | | - if (net->ipv4.sysctl_tcp_nometrics_save || !dst) |
|---|
| 352 | + if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) |
|---|
| 333 | 353 | return; |
|---|
| 334 | 354 | |
|---|
| 335 | 355 | rcu_read_lock(); |
|---|
| .. | .. |
|---|
| 385 | 405 | |
|---|
| 386 | 406 | if (tcp_in_initial_slowstart(tp)) { |
|---|
| 387 | 407 | /* Slow start still did not finish. */ |
|---|
| 388 | | - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { |
|---|
| 408 | + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && |
|---|
| 409 | + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { |
|---|
| 389 | 410 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); |
|---|
| 390 | 411 | if (val && (tp->snd_cwnd >> 1) > val) |
|---|
| 391 | 412 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, |
|---|
| .. | .. |
|---|
| 400 | 421 | } else if (!tcp_in_slow_start(tp) && |
|---|
| 401 | 422 | icsk->icsk_ca_state == TCP_CA_Open) { |
|---|
| 402 | 423 | /* Cong. avoidance phase, cwnd is reliable. */ |
|---|
| 403 | | - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) |
|---|
| 424 | + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && |
|---|
| 425 | + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) |
|---|
| 404 | 426 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, |
|---|
| 405 | 427 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); |
|---|
| 406 | 428 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { |
|---|
| .. | .. |
|---|
| 416 | 438 | tcp_metric_set(tm, TCP_METRIC_CWND, |
|---|
| 417 | 439 | (val + tp->snd_ssthresh) >> 1); |
|---|
| 418 | 440 | } |
|---|
| 419 | | - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { |
|---|
| 441 | + if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && |
|---|
| 442 | + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { |
|---|
| 420 | 443 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); |
|---|
| 421 | 444 | if (val && tp->snd_ssthresh > val) |
|---|
| 422 | 445 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, |
|---|
| .. | .. |
|---|
| 425 | 448 | if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { |
|---|
| 426 | 449 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); |
|---|
| 427 | 450 | if (val < tp->reordering && |
|---|
| 428 | | - tp->reordering != net->ipv4.sysctl_tcp_reordering) |
|---|
| 451 | + tp->reordering != |
|---|
| 452 | + READ_ONCE(net->ipv4.sysctl_tcp_reordering)) |
|---|
| 429 | 453 | tcp_metric_set(tm, TCP_METRIC_REORDERING, |
|---|
| 430 | 454 | tp->reordering); |
|---|
| 431 | 455 | } |
|---|
| 432 | 456 | } |
|---|
| 433 | | - tm->tcpm_stamp = jiffies; |
|---|
| 457 | + WRITE_ONCE(tm->tcpm_stamp, jiffies); |
|---|
| 434 | 458 | out_unlock: |
|---|
| 435 | 459 | rcu_read_unlock(); |
|---|
| 436 | 460 | } |
|---|
| .. | .. |
|---|
| 441 | 465 | { |
|---|
| 442 | 466 | struct dst_entry *dst = __sk_dst_get(sk); |
|---|
| 443 | 467 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 468 | + struct net *net = sock_net(sk); |
|---|
| 444 | 469 | struct tcp_metrics_block *tm; |
|---|
| 445 | 470 | u32 val, crtt = 0; /* cached RTT scaled by 8 */ |
|---|
| 446 | 471 | |
|---|
| .. | .. |
|---|
| 458 | 483 | if (tcp_metric_locked(tm, TCP_METRIC_CWND)) |
|---|
| 459 | 484 | tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); |
|---|
| 460 | 485 | |
|---|
| 461 | | - val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); |
|---|
| 486 | + val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? |
|---|
| 487 | + 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); |
|---|
| 462 | 488 | if (val) { |
|---|
| 463 | 489 | tp->snd_ssthresh = val; |
|---|
| 464 | 490 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) |
|---|
| .. | .. |
|---|
| 512 | 538 | |
|---|
| 513 | 539 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; |
|---|
| 514 | 540 | } |
|---|
| 515 | | - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
|---|
| 516 | | - * retransmitted. In light of RFC6298 more aggressive 1sec |
|---|
| 517 | | - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
|---|
| 518 | | - * retransmission has occurred. |
|---|
| 519 | | - */ |
|---|
| 520 | | - if (tp->total_retrans > 1) |
|---|
| 521 | | - tp->snd_cwnd = 1; |
|---|
| 522 | | - else |
|---|
| 523 | | - tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
|---|
| 524 | | - tp->snd_cwnd_stamp = tcp_jiffies32; |
|---|
| 525 | 541 | } |
|---|
| 526 | 542 | |
|---|
| 527 | 543 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) |
|---|
| .. | .. |
|---|
| 542 | 558 | |
|---|
| 543 | 559 | return ret; |
|---|
| 544 | 560 | } |
|---|
| 545 | | - |
|---|
| 546 | | -static DEFINE_SEQLOCK(fastopen_seqlock); |
|---|
| 547 | 561 | |
|---|
| 548 | 562 | void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, |
|---|
| 549 | 563 | struct tcp_fastopen_cookie *cookie) |
|---|
| .. | .. |
|---|
| 651 | 665 | } |
|---|
| 652 | 666 | |
|---|
| 653 | 667 | if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, |
|---|
| 654 | | - jiffies - tm->tcpm_stamp, |
|---|
| 668 | + jiffies - READ_ONCE(tm->tcpm_stamp), |
|---|
| 655 | 669 | TCP_METRICS_ATTR_PAD) < 0) |
|---|
| 656 | 670 | goto nla_put_failure; |
|---|
| 657 | 671 | |
|---|
| 658 | 672 | { |
|---|
| 659 | 673 | int n = 0; |
|---|
| 660 | 674 | |
|---|
| 661 | | - nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); |
|---|
| 675 | + nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); |
|---|
| 662 | 676 | if (!nest) |
|---|
| 663 | 677 | goto nla_put_failure; |
|---|
| 664 | 678 | for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { |
|---|
| 665 | | - u32 val = tm->tcpm_vals[i]; |
|---|
| 679 | + u32 val = tcp_metric_get(tm, i); |
|---|
| 666 | 680 | |
|---|
| 667 | 681 | if (!val) |
|---|
| 668 | 682 | continue; |
|---|
| .. | .. |
|---|
| 948 | 962 | return 0; |
|---|
| 949 | 963 | } |
|---|
| 950 | 964 | |
|---|
| 951 | | -static const struct genl_ops tcp_metrics_nl_ops[] = { |
|---|
| 965 | +static const struct genl_small_ops tcp_metrics_nl_ops[] = { |
|---|
| 952 | 966 | { |
|---|
| 953 | 967 | .cmd = TCP_METRICS_CMD_GET, |
|---|
| 968 | + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, |
|---|
| 954 | 969 | .doit = tcp_metrics_nl_cmd_get, |
|---|
| 955 | 970 | .dumpit = tcp_metrics_nl_dump, |
|---|
| 956 | | - .policy = tcp_metrics_nl_policy, |
|---|
| 957 | 971 | }, |
|---|
| 958 | 972 | { |
|---|
| 959 | 973 | .cmd = TCP_METRICS_CMD_DEL, |
|---|
| 974 | + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, |
|---|
| 960 | 975 | .doit = tcp_metrics_nl_cmd_del, |
|---|
| 961 | | - .policy = tcp_metrics_nl_policy, |
|---|
| 962 | 976 | .flags = GENL_ADMIN_PERM, |
|---|
| 963 | 977 | }, |
|---|
| 964 | 978 | }; |
|---|
| .. | .. |
|---|
| 968 | 982 | .name = TCP_METRICS_GENL_NAME, |
|---|
| 969 | 983 | .version = TCP_METRICS_GENL_VERSION, |
|---|
| 970 | 984 | .maxattr = TCP_METRICS_ATTR_MAX, |
|---|
| 985 | + .policy = tcp_metrics_nl_policy, |
|---|
| 971 | 986 | .netnsok = true, |
|---|
| 972 | 987 | .module = THIS_MODULE, |
|---|
| 973 | | - .ops = tcp_metrics_nl_ops, |
|---|
| 974 | | - .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), |
|---|
| 988 | + .small_ops = tcp_metrics_nl_ops, |
|---|
| 989 | + .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), |
|---|
| 975 | 990 | }; |
|---|
| 976 | 991 | |
|---|
| 977 | 992 | static unsigned int tcpmhash_entries; |
|---|
| .. | .. |
|---|
| 1000 | 1015 | |
|---|
| 1001 | 1016 | slots = tcpmhash_entries; |
|---|
| 1002 | 1017 | if (!slots) { |
|---|
| 1003 | | - if (totalram_pages >= 128 * 1024) |
|---|
| 1018 | + if (totalram_pages() >= 128 * 1024) |
|---|
| 1004 | 1019 | slots = 16 * 1024; |
|---|
| 1005 | 1020 | else |
|---|
| 1006 | 1021 | slots = 8 * 1024; |
|---|