.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
---|
3 | 4 | * operating system. INET is implemented using the BSD Socket |
---|
.. | .. |
---|
205 | 206 | * Hirokazu Takahashi : Use copy_from_user() instead of |
---|
206 | 207 | * csum_and_copy_from_user() if possible. |
---|
207 | 208 | * |
---|
208 | | - * This program is free software; you can redistribute it and/or |
---|
209 | | - * modify it under the terms of the GNU General Public License |
---|
210 | | - * as published by the Free Software Foundation; either version |
---|
211 | | - * 2 of the License, or(at your option) any later version. |
---|
212 | | - * |
---|
213 | 209 | * Description of States: |
---|
214 | 210 | * |
---|
215 | 211 | * TCP_SYN_SENT sent a connection request, waiting for ack |
---|
.. | .. |
---|
262 | 258 | #include <linux/net.h> |
---|
263 | 259 | #include <linux/socket.h> |
---|
264 | 260 | #include <linux/random.h> |
---|
265 | | -#include <linux/bootmem.h> |
---|
| 261 | +#include <linux/memblock.h> |
---|
266 | 262 | #include <linux/highmem.h> |
---|
267 | 263 | #include <linux/swap.h> |
---|
268 | 264 | #include <linux/cache.h> |
---|
.. | .. |
---|
275 | 271 | #include <net/icmp.h> |
---|
276 | 272 | #include <net/inet_common.h> |
---|
277 | 273 | #include <net/tcp.h> |
---|
| 274 | +#include <net/mptcp.h> |
---|
278 | 275 | #include <net/xfrm.h> |
---|
279 | 276 | #include <net/ip.h> |
---|
280 | 277 | #include <net/sock.h> |
---|
.. | .. |
---|
282 | 279 | #include <linux/uaccess.h> |
---|
283 | 280 | #include <asm/ioctls.h> |
---|
284 | 281 | #include <net/busy_poll.h> |
---|
| 282 | + |
---|
| 283 | +#include <trace/hooks/ipv4.h> |
---|
285 | 284 | |
---|
286 | 285 | struct percpu_counter tcp_orphan_count; |
---|
287 | 286 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
---|
.. | .. |
---|
320 | 319 | */ |
---|
321 | 320 | unsigned long tcp_memory_pressure __read_mostly; |
---|
322 | 321 | EXPORT_SYMBOL_GPL(tcp_memory_pressure); |
---|
| 322 | + |
---|
| 323 | +DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); |
---|
| 324 | +EXPORT_SYMBOL(tcp_rx_skb_cache_key); |
---|
| 325 | + |
---|
| 326 | +DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); |
---|
323 | 327 | |
---|
324 | 328 | void tcp_enter_memory_pressure(struct sock *sk) |
---|
325 | 329 | { |
---|
.. | .. |
---|
416 | 420 | INIT_LIST_HEAD(&tp->tsorted_sent_queue); |
---|
417 | 421 | |
---|
418 | 422 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
---|
| 423 | + icsk->icsk_rto_min = TCP_RTO_MIN; |
---|
| 424 | + icsk->icsk_delack_max = TCP_DELACK_MAX; |
---|
419 | 425 | tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
---|
420 | 426 | minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); |
---|
421 | 427 | |
---|
.. | .. |
---|
428 | 434 | |
---|
429 | 435 | /* There's a bubble in the pipe until at least the first ACK. */ |
---|
430 | 436 | tp->app_limited = ~0U; |
---|
| 437 | + tp->rate_app_limited = 1; |
---|
431 | 438 | |
---|
432 | 439 | /* See draft-stevens-tcpca-spec-01 for discussion of the |
---|
433 | 440 | * initialization of these values. |
---|
.. | .. |
---|
436 | 443 | tp->snd_cwnd_clamp = ~0; |
---|
437 | 444 | tp->mss_cache = TCP_MSS_DEFAULT; |
---|
438 | 445 | |
---|
439 | | - tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; |
---|
| 446 | + tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); |
---|
440 | 447 | tcp_assign_congestion_control(sk); |
---|
441 | 448 | |
---|
442 | 449 | tp->tsoffset = 0; |
---|
443 | 450 | tp->rack.reo_wnd_steps = 1; |
---|
444 | | - |
---|
445 | | - sk->sk_state = TCP_CLOSE; |
---|
446 | 451 | |
---|
447 | 452 | sk->sk_write_space = sk_stream_write_space; |
---|
448 | 453 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
---|
449 | 454 | |
---|
450 | 455 | icsk->icsk_sync_mss = tcp_sync_mss; |
---|
451 | 456 | |
---|
452 | | - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; |
---|
453 | | - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; |
---|
| 457 | + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); |
---|
| 458 | + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); |
---|
454 | 459 | |
---|
455 | 460 | sk_sockets_allocated_inc(sk); |
---|
456 | 461 | sk->sk_route_forced_caps = NETIF_F_GSO; |
---|
457 | 462 | } |
---|
458 | 463 | EXPORT_SYMBOL(tcp_init_sock); |
---|
459 | | - |
---|
460 | | -void tcp_init_transfer(struct sock *sk, int bpf_op) |
---|
461 | | -{ |
---|
462 | | - struct inet_connection_sock *icsk = inet_csk(sk); |
---|
463 | | - |
---|
464 | | - tcp_mtup_init(sk); |
---|
465 | | - icsk->icsk_af_ops->rebuild_header(sk); |
---|
466 | | - tcp_init_metrics(sk); |
---|
467 | | - tcp_call_bpf(sk, bpf_op, 0, NULL); |
---|
468 | | - tcp_init_congestion_control(sk); |
---|
469 | | - tcp_init_buffer_space(sk); |
---|
470 | | -} |
---|
471 | 464 | |
---|
472 | 465 | static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) |
---|
473 | 466 | { |
---|
.. | .. |
---|
515 | 508 | __poll_t mask; |
---|
516 | 509 | struct sock *sk = sock->sk; |
---|
517 | 510 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
| 511 | + u8 shutdown; |
---|
518 | 512 | int state; |
---|
519 | 513 | |
---|
520 | 514 | sock_poll_wait(file, sock, wait); |
---|
.. | .. |
---|
557 | 551 | * NOTE. Check for TCP_CLOSE is added. The goal is to prevent |
---|
558 | 552 | * blocking on fresh not-connected or disconnected socket. --ANK |
---|
559 | 553 | */ |
---|
560 | | - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) |
---|
| 554 | + shutdown = READ_ONCE(sk->sk_shutdown); |
---|
| 555 | + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) |
---|
561 | 556 | mask |= EPOLLHUP; |
---|
562 | | - if (sk->sk_shutdown & RCV_SHUTDOWN) |
---|
| 557 | + if (shutdown & RCV_SHUTDOWN) |
---|
563 | 558 | mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; |
---|
564 | 559 | |
---|
565 | 560 | /* Connected or passive Fast Open socket? */ |
---|
566 | 561 | if (state != TCP_SYN_SENT && |
---|
567 | | - (state != TCP_SYN_RECV || tp->fastopen_rsk)) { |
---|
| 562 | + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { |
---|
568 | 563 | int target = sock_rcvlowat(sk, 0, INT_MAX); |
---|
569 | 564 | |
---|
570 | | - if (tp->urg_seq == READ_ONCE(tp->copied_seq) && |
---|
| 565 | + if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && |
---|
571 | 566 | !sock_flag(sk, SOCK_URGINLINE) && |
---|
572 | 567 | tp->urg_data) |
---|
573 | 568 | target++; |
---|
.. | .. |
---|
575 | 570 | if (tcp_stream_is_readable(tp, target, sk)) |
---|
576 | 571 | mask |= EPOLLIN | EPOLLRDNORM; |
---|
577 | 572 | |
---|
578 | | - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
---|
579 | | - if (sk_stream_is_writeable(sk)) { |
---|
| 573 | + if (!(shutdown & SEND_SHUTDOWN)) { |
---|
| 574 | + if (__sk_stream_is_writeable(sk, 1)) { |
---|
580 | 575 | mask |= EPOLLOUT | EPOLLWRNORM; |
---|
581 | 576 | } else { /* send SIGIO later */ |
---|
582 | 577 | sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); |
---|
.. | .. |
---|
588 | 583 | * pairs with the input side. |
---|
589 | 584 | */ |
---|
590 | 585 | smp_mb__after_atomic(); |
---|
591 | | - if (sk_stream_is_writeable(sk)) |
---|
| 586 | + if (__sk_stream_is_writeable(sk, 1)) |
---|
592 | 587 | mask |= EPOLLOUT | EPOLLWRNORM; |
---|
593 | 588 | } |
---|
594 | 589 | } else |
---|
.. | .. |
---|
628 | 623 | unlock_sock_fast(sk, slow); |
---|
629 | 624 | break; |
---|
630 | 625 | case SIOCATMARK: |
---|
631 | | - answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq); |
---|
| 626 | + answ = tp->urg_data && |
---|
| 627 | + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); |
---|
632 | 628 | break; |
---|
633 | 629 | case SIOCOUTQ: |
---|
634 | 630 | if (sk->sk_state == TCP_LISTEN) |
---|
.. | .. |
---|
646 | 642 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
---|
647 | 643 | answ = 0; |
---|
648 | 644 | else |
---|
649 | | - answ = READ_ONCE(tp->write_seq) - tp->snd_nxt; |
---|
| 645 | + answ = READ_ONCE(tp->write_seq) - |
---|
| 646 | + READ_ONCE(tp->snd_nxt); |
---|
650 | 647 | break; |
---|
651 | 648 | default: |
---|
652 | 649 | return -ENOIOCTLCMD; |
---|
.. | .. |
---|
678 | 675 | tcb->sacked = 0; |
---|
679 | 676 | __skb_header_release(skb); |
---|
680 | 677 | tcp_add_write_queue_tail(sk, skb); |
---|
681 | | - sk->sk_wmem_queued += skb->truesize; |
---|
| 678 | + sk_wmem_queued_add(sk, skb->truesize); |
---|
682 | 679 | sk_mem_charge(sk, skb->truesize); |
---|
683 | 680 | if (tp->nonagle & TCP_NAGLE_PUSH) |
---|
684 | 681 | tp->nonagle &= ~TCP_NAGLE_PUSH; |
---|
.. | .. |
---|
706 | 703 | int size_goal) |
---|
707 | 704 | { |
---|
708 | 705 | return skb->len < size_goal && |
---|
709 | | - sock_net(sk)->ipv4.sysctl_tcp_autocorking && |
---|
| 706 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && |
---|
710 | 707 | !tcp_rtx_queue_empty(sk) && |
---|
711 | 708 | refcount_read(&sk->sk_wmem_alloc) > skb->truesize; |
---|
712 | 709 | } |
---|
713 | 710 | |
---|
714 | | -static void tcp_push(struct sock *sk, int flags, int mss_now, |
---|
715 | | - int nonagle, int size_goal) |
---|
| 711 | +void tcp_push(struct sock *sk, int flags, int mss_now, |
---|
| 712 | + int nonagle, int size_goal) |
---|
716 | 713 | { |
---|
717 | 714 | struct tcp_sock *tp = tcp_sk(sk); |
---|
718 | 715 | struct sk_buff *skb; |
---|
.. | .. |
---|
875 | 872 | { |
---|
876 | 873 | struct sk_buff *skb; |
---|
877 | 874 | |
---|
| 875 | + if (likely(!size)) { |
---|
| 876 | + skb = sk->sk_tx_skb_cache; |
---|
| 877 | + if (skb) { |
---|
| 878 | + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); |
---|
| 879 | + sk->sk_tx_skb_cache = NULL; |
---|
| 880 | + pskb_trim(skb, 0); |
---|
| 881 | + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); |
---|
| 882 | + skb_shinfo(skb)->tx_flags = 0; |
---|
| 883 | + memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); |
---|
| 884 | + return skb; |
---|
| 885 | + } |
---|
| 886 | + } |
---|
878 | 887 | /* The TCP header must be at least 32-bit aligned. */ |
---|
879 | 888 | size = ALIGN(size, 4); |
---|
880 | 889 | |
---|
.. | .. |
---|
934 | 943 | return max(size_goal, mss_now); |
---|
935 | 944 | } |
---|
936 | 945 | |
---|
937 | | -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) |
---|
| 946 | +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) |
---|
938 | 947 | { |
---|
939 | 948 | int mss_now; |
---|
940 | 949 | |
---|
.. | .. |
---|
969 | 978 | ssize_t copied; |
---|
970 | 979 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
---|
971 | 980 | |
---|
| 981 | + if (IS_ENABLED(CONFIG_DEBUG_VM) && |
---|
| 982 | + WARN_ONCE(!sendpage_ok(page), |
---|
| 983 | + "page must not be a Slab one and have page_count > 0")) |
---|
| 984 | + return -EINVAL; |
---|
| 985 | + |
---|
972 | 986 | /* Wait for a connection to finish. One exception is TCP Fast Open |
---|
973 | 987 | * (passive side) where data is allowed to be sent before a connection |
---|
974 | 988 | * is fully established. |
---|
.. | .. |
---|
998 | 1012 | !tcp_skb_can_collapse_to(skb)) { |
---|
999 | 1013 | new_segment: |
---|
1000 | 1014 | if (!sk_stream_memory_free(sk)) |
---|
1001 | | - goto wait_for_sndbuf; |
---|
| 1015 | + goto wait_for_space; |
---|
1002 | 1016 | |
---|
1003 | 1017 | skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, |
---|
1004 | 1018 | tcp_rtx_and_write_queues_empty(sk)); |
---|
1005 | 1019 | if (!skb) |
---|
1006 | | - goto wait_for_memory; |
---|
| 1020 | + goto wait_for_space; |
---|
1007 | 1021 | |
---|
| 1022 | +#ifdef CONFIG_TLS_DEVICE |
---|
| 1023 | + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); |
---|
| 1024 | +#endif |
---|
1008 | 1025 | skb_entail(sk, skb); |
---|
1009 | 1026 | copy = size_goal; |
---|
1010 | 1027 | } |
---|
.. | .. |
---|
1019 | 1036 | goto new_segment; |
---|
1020 | 1037 | } |
---|
1021 | 1038 | if (!sk_wmem_schedule(sk, copy)) |
---|
1022 | | - goto wait_for_memory; |
---|
| 1039 | + goto wait_for_space; |
---|
1023 | 1040 | |
---|
1024 | 1041 | if (can_coalesce) { |
---|
1025 | 1042 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); |
---|
.. | .. |
---|
1034 | 1051 | skb->len += copy; |
---|
1035 | 1052 | skb->data_len += copy; |
---|
1036 | 1053 | skb->truesize += copy; |
---|
1037 | | - sk->sk_wmem_queued += copy; |
---|
| 1054 | + sk_wmem_queued_add(sk, copy); |
---|
1038 | 1055 | sk_mem_charge(sk, copy); |
---|
1039 | 1056 | skb->ip_summed = CHECKSUM_PARTIAL; |
---|
1040 | 1057 | WRITE_ONCE(tp->write_seq, tp->write_seq + copy); |
---|
.. | .. |
---|
1060 | 1077 | tcp_push_one(sk, mss_now); |
---|
1061 | 1078 | continue; |
---|
1062 | 1079 | |
---|
1063 | | -wait_for_sndbuf: |
---|
| 1080 | +wait_for_space: |
---|
1064 | 1081 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
---|
1065 | | -wait_for_memory: |
---|
1066 | 1082 | tcp_push(sk, flags & ~MSG_MORE, mss_now, |
---|
1067 | 1083 | TCP_NAGLE_PUSH, size_goal); |
---|
1068 | 1084 | |
---|
.. | .. |
---|
1120 | 1136 | } |
---|
1121 | 1137 | EXPORT_SYMBOL(tcp_sendpage); |
---|
1122 | 1138 | |
---|
1123 | | -/* Do not bother using a page frag for very small frames. |
---|
1124 | | - * But use this heuristic only for the first skb in write queue. |
---|
1125 | | - * |
---|
1126 | | - * Having no payload in skb->head allows better SACK shifting |
---|
1127 | | - * in tcp_shift_skb_data(), reducing sack/rack overhead, because |
---|
1128 | | - * write queue has less skbs. |
---|
1129 | | - * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB. |
---|
1130 | | - * This also speeds up tso_fragment(), since it wont fallback |
---|
1131 | | - * to tcp_fragment(). |
---|
1132 | | - */ |
---|
1133 | | -static int linear_payload_sz(bool first_skb) |
---|
1134 | | -{ |
---|
1135 | | - if (first_skb) |
---|
1136 | | - return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); |
---|
1137 | | - return 0; |
---|
1138 | | -} |
---|
1139 | | - |
---|
1140 | | -static int select_size(bool first_skb, bool zc) |
---|
1141 | | -{ |
---|
1142 | | - if (zc) |
---|
1143 | | - return 0; |
---|
1144 | | - return linear_payload_sz(first_skb); |
---|
1145 | | -} |
---|
1146 | | - |
---|
1147 | 1139 | void tcp_free_fastopen_req(struct tcp_sock *tp) |
---|
1148 | 1140 | { |
---|
1149 | 1141 | if (tp->fastopen_req) { |
---|
.. | .. |
---|
1153 | 1145 | } |
---|
1154 | 1146 | |
---|
1155 | 1147 | static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, |
---|
1156 | | - int *copied, size_t size) |
---|
| 1148 | + int *copied, size_t size, |
---|
| 1149 | + struct ubuf_info *uarg) |
---|
1157 | 1150 | { |
---|
1158 | 1151 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1159 | 1152 | struct inet_sock *inet = inet_sk(sk); |
---|
1160 | 1153 | struct sockaddr *uaddr = msg->msg_name; |
---|
1161 | 1154 | int err, flags; |
---|
1162 | 1155 | |
---|
1163 | | - if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || |
---|
| 1156 | + if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & |
---|
| 1157 | + TFO_CLIENT_ENABLE) || |
---|
1164 | 1158 | (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && |
---|
1165 | 1159 | uaddr->sa_family == AF_UNSPEC)) |
---|
1166 | 1160 | return -EOPNOTSUPP; |
---|
.. | .. |
---|
1173 | 1167 | return -ENOBUFS; |
---|
1174 | 1168 | tp->fastopen_req->data = msg; |
---|
1175 | 1169 | tp->fastopen_req->size = size; |
---|
| 1170 | + tp->fastopen_req->uarg = uarg; |
---|
1176 | 1171 | |
---|
1177 | 1172 | if (inet->defer_connect) { |
---|
1178 | 1173 | err = tcp_connect(sk); |
---|
.. | .. |
---|
1205 | 1200 | struct sockcm_cookie sockc; |
---|
1206 | 1201 | int flags, err, copied = 0; |
---|
1207 | 1202 | int mss_now = 0, size_goal, copied_syn = 0; |
---|
1208 | | - bool process_backlog = false; |
---|
| 1203 | + int process_backlog = 0; |
---|
1209 | 1204 | bool zc = false; |
---|
1210 | 1205 | long timeo; |
---|
1211 | 1206 | |
---|
| 1207 | + trace_android_rvh_tcp_sendmsg_locked(sk, size); |
---|
1212 | 1208 | flags = msg->msg_flags; |
---|
1213 | 1209 | |
---|
1214 | 1210 | if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { |
---|
1215 | | - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { |
---|
1216 | | - err = -EINVAL; |
---|
1217 | | - goto out_err; |
---|
1218 | | - } |
---|
1219 | | - |
---|
1220 | 1211 | skb = tcp_write_queue_tail(sk); |
---|
1221 | 1212 | uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); |
---|
1222 | 1213 | if (!uarg) { |
---|
.. | .. |
---|
1231 | 1222 | |
---|
1232 | 1223 | if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && |
---|
1233 | 1224 | !tp->repair) { |
---|
1234 | | - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); |
---|
| 1225 | + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); |
---|
1235 | 1226 | if (err == -EINPROGRESS && copied_syn > 0) |
---|
1236 | 1227 | goto out; |
---|
1237 | 1228 | else if (err) |
---|
.. | .. |
---|
1297 | 1288 | |
---|
1298 | 1289 | if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { |
---|
1299 | 1290 | bool first_skb; |
---|
1300 | | - int linear; |
---|
1301 | 1291 | |
---|
1302 | 1292 | new_segment: |
---|
1303 | 1293 | if (!sk_stream_memory_free(sk)) |
---|
1304 | | - goto wait_for_sndbuf; |
---|
| 1294 | + goto wait_for_space; |
---|
1305 | 1295 | |
---|
1306 | | - if (process_backlog && sk_flush_backlog(sk)) { |
---|
1307 | | - process_backlog = false; |
---|
1308 | | - goto restart; |
---|
| 1296 | + if (unlikely(process_backlog >= 16)) { |
---|
| 1297 | + process_backlog = 0; |
---|
| 1298 | + if (sk_flush_backlog(sk)) |
---|
| 1299 | + goto restart; |
---|
1309 | 1300 | } |
---|
1310 | 1301 | first_skb = tcp_rtx_and_write_queues_empty(sk); |
---|
1311 | | - linear = select_size(first_skb, zc); |
---|
1312 | | - skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, |
---|
| 1302 | + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, |
---|
1313 | 1303 | first_skb); |
---|
1314 | 1304 | if (!skb) |
---|
1315 | | - goto wait_for_memory; |
---|
| 1305 | + goto wait_for_space; |
---|
1316 | 1306 | |
---|
1317 | | - process_backlog = true; |
---|
| 1307 | + process_backlog++; |
---|
1318 | 1308 | skb->ip_summed = CHECKSUM_PARTIAL; |
---|
1319 | 1309 | |
---|
1320 | 1310 | skb_entail(sk, skb); |
---|
1321 | 1311 | copy = size_goal; |
---|
1322 | 1312 | |
---|
1323 | 1313 | /* All packets are restored as if they have |
---|
1324 | | - * already been sent. skb_mstamp isn't set to |
---|
| 1314 | + * already been sent. skb_mstamp_ns isn't set to |
---|
1325 | 1315 | * avoid wrong rtt estimation. |
---|
1326 | 1316 | */ |
---|
1327 | 1317 | if (tp->repair) |
---|
.. | .. |
---|
1345 | 1335 | struct page_frag *pfrag = sk_page_frag(sk); |
---|
1346 | 1336 | |
---|
1347 | 1337 | if (!sk_page_frag_refill(sk, pfrag)) |
---|
1348 | | - goto wait_for_memory; |
---|
| 1338 | + goto wait_for_space; |
---|
1349 | 1339 | |
---|
1350 | 1340 | if (!skb_can_coalesce(skb, i, pfrag->page, |
---|
1351 | 1341 | pfrag->offset)) { |
---|
.. | .. |
---|
1359 | 1349 | copy = min_t(int, copy, pfrag->size - pfrag->offset); |
---|
1360 | 1350 | |
---|
1361 | 1351 | if (!sk_wmem_schedule(sk, copy)) |
---|
1362 | | - goto wait_for_memory; |
---|
| 1352 | + goto wait_for_space; |
---|
1363 | 1353 | |
---|
1364 | 1354 | err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, |
---|
1365 | 1355 | pfrag->page, |
---|
.. | .. |
---|
1378 | 1368 | } |
---|
1379 | 1369 | pfrag->offset += copy; |
---|
1380 | 1370 | } else { |
---|
| 1371 | + if (!sk_wmem_schedule(sk, copy)) |
---|
| 1372 | + goto wait_for_space; |
---|
| 1373 | + |
---|
1381 | 1374 | err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); |
---|
1382 | 1375 | if (err == -EMSGSIZE || err == -EEXIST) { |
---|
1383 | 1376 | tcp_mark_push(tp, skb); |
---|
.. | .. |
---|
1412 | 1405 | tcp_push_one(sk, mss_now); |
---|
1413 | 1406 | continue; |
---|
1414 | 1407 | |
---|
1415 | | -wait_for_sndbuf: |
---|
| 1408 | +wait_for_space: |
---|
1416 | 1409 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
---|
1417 | | -wait_for_memory: |
---|
1418 | 1410 | if (copied) |
---|
1419 | 1411 | tcp_push(sk, flags & ~MSG_MORE, mss_now, |
---|
1420 | 1412 | TCP_NAGLE_PUSH, size_goal); |
---|
.. | .. |
---|
1443 | 1435 | if (copied + copied_syn) |
---|
1444 | 1436 | goto out; |
---|
1445 | 1437 | out_err: |
---|
1446 | | - sock_zerocopy_put_abort(uarg); |
---|
| 1438 | + sock_zerocopy_put_abort(uarg, true); |
---|
1447 | 1439 | err = sk_stream_error(sk, flags, err); |
---|
1448 | 1440 | /* make sure we wake any epoll edge trigger waiter */ |
---|
1449 | 1441 | if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { |
---|
.. | .. |
---|
1546 | 1538 | * calculation of whether or not we must ACK for the sake of |
---|
1547 | 1539 | * a window update. |
---|
1548 | 1540 | */ |
---|
1549 | | -static void tcp_cleanup_rbuf(struct sock *sk, int copied) |
---|
| 1541 | +void tcp_cleanup_rbuf(struct sock *sk, int copied) |
---|
1550 | 1542 | { |
---|
1551 | 1543 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1552 | 1544 | bool time_to_ack = false; |
---|
.. | .. |
---|
1559 | 1551 | |
---|
1560 | 1552 | if (inet_csk_ack_scheduled(sk)) { |
---|
1561 | 1553 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
1562 | | - /* Delayed ACKs frequently hit locked sockets during bulk |
---|
1563 | | - * receive. */ |
---|
1564 | | - if (icsk->icsk_ack.blocked || |
---|
1565 | | - /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
---|
| 1554 | + |
---|
| 1555 | + if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ |
---|
1566 | 1556 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
---|
1567 | 1557 | /* |
---|
1568 | 1558 | * If this read emptied read buffer, we send ACK, if |
---|
.. | .. |
---|
1573 | 1563 | (copied > 0 && |
---|
1574 | 1564 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || |
---|
1575 | 1565 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
---|
1576 | | - !icsk->icsk_ack.pingpong)) && |
---|
| 1566 | + !inet_csk_in_pingpong_mode(sk))) && |
---|
1577 | 1567 | !atomic_read(&sk->sk_rmem_alloc))) |
---|
1578 | 1568 | time_to_ack = true; |
---|
1579 | 1569 | } |
---|
.. | .. |
---|
1669 | 1659 | if (!copied) |
---|
1670 | 1660 | copied = used; |
---|
1671 | 1661 | break; |
---|
1672 | | - } else if (used <= len) { |
---|
1673 | | - seq += used; |
---|
1674 | | - copied += used; |
---|
1675 | | - offset += used; |
---|
1676 | 1662 | } |
---|
| 1663 | + if (WARN_ON_ONCE(used > len)) |
---|
| 1664 | + used = len; |
---|
| 1665 | + seq += used; |
---|
| 1666 | + copied += used; |
---|
| 1667 | + offset += used; |
---|
| 1668 | + |
---|
1677 | 1669 | /* If recv_actor drops the lock (e.g. TCP splice |
---|
1678 | 1670 | * receive) the skb pointer might be invalid when |
---|
1679 | 1671 | * getting here: tcp_collapse might have deleted it |
---|
.. | .. |
---|
1725 | 1717 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) |
---|
1726 | 1718 | cap = sk->sk_rcvbuf >> 1; |
---|
1727 | 1719 | else |
---|
1728 | | - cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; |
---|
| 1720 | + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
---|
1729 | 1721 | val = min(val, cap); |
---|
1730 | | - sk->sk_rcvlowat = val ? : 1; |
---|
| 1722 | + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); |
---|
1731 | 1723 | |
---|
1732 | 1724 | /* Check if we need to signal EPOLLIN right now */ |
---|
1733 | 1725 | tcp_data_ready(sk); |
---|
.. | .. |
---|
1737 | 1729 | |
---|
1738 | 1730 | val <<= 1; |
---|
1739 | 1731 | if (val > sk->sk_rcvbuf) { |
---|
1740 | | - sk->sk_rcvbuf = val; |
---|
| 1732 | + WRITE_ONCE(sk->sk_rcvbuf, val); |
---|
1741 | 1733 | tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); |
---|
1742 | 1734 | } |
---|
1743 | 1735 | return 0; |
---|
.. | .. |
---|
1755 | 1747 | return -EPERM; |
---|
1756 | 1748 | vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); |
---|
1757 | 1749 | |
---|
1758 | | - /* Instruct vm_insert_page() to not down_read(mmap_sem) */ |
---|
| 1750 | + /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ |
---|
1759 | 1751 | vma->vm_flags |= VM_MIXEDMAP; |
---|
1760 | 1752 | |
---|
1761 | 1753 | vma->vm_ops = &tcp_vm_ops; |
---|
.. | .. |
---|
1763 | 1755 | } |
---|
1764 | 1756 | EXPORT_SYMBOL(tcp_mmap); |
---|
1765 | 1757 | |
---|
| 1758 | +static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, |
---|
| 1759 | + u32 *offset_frag) |
---|
| 1760 | +{ |
---|
| 1761 | + skb_frag_t *frag; |
---|
| 1762 | + |
---|
| 1763 | + if (unlikely(offset_skb >= skb->len)) |
---|
| 1764 | + return NULL; |
---|
| 1765 | + |
---|
| 1766 | + offset_skb -= skb_headlen(skb); |
---|
| 1767 | + if ((int)offset_skb < 0 || skb_has_frag_list(skb)) |
---|
| 1768 | + return NULL; |
---|
| 1769 | + |
---|
| 1770 | + frag = skb_shinfo(skb)->frags; |
---|
| 1771 | + while (offset_skb) { |
---|
| 1772 | + if (skb_frag_size(frag) > offset_skb) { |
---|
| 1773 | + *offset_frag = offset_skb; |
---|
| 1774 | + return frag; |
---|
| 1775 | + } |
---|
| 1776 | + offset_skb -= skb_frag_size(frag); |
---|
| 1777 | + ++frag; |
---|
| 1778 | + } |
---|
| 1779 | + *offset_frag = 0; |
---|
| 1780 | + return frag; |
---|
| 1781 | +} |
---|
| 1782 | + |
---|
| 1783 | +static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, |
---|
| 1784 | + struct sk_buff *skb, u32 copylen, |
---|
| 1785 | + u32 *offset, u32 *seq) |
---|
| 1786 | +{ |
---|
| 1787 | + unsigned long copy_address = (unsigned long)zc->copybuf_address; |
---|
| 1788 | + struct msghdr msg = {}; |
---|
| 1789 | + struct iovec iov; |
---|
| 1790 | + int err; |
---|
| 1791 | + |
---|
| 1792 | + if (copy_address != zc->copybuf_address) |
---|
| 1793 | + return -EINVAL; |
---|
| 1794 | + |
---|
| 1795 | + err = import_single_range(READ, (void __user *)copy_address, |
---|
| 1796 | + copylen, &iov, &msg.msg_iter); |
---|
| 1797 | + if (err) |
---|
| 1798 | + return err; |
---|
| 1799 | + err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); |
---|
| 1800 | + if (err) |
---|
| 1801 | + return err; |
---|
| 1802 | + zc->recv_skip_hint -= copylen; |
---|
| 1803 | + *offset += copylen; |
---|
| 1804 | + *seq += copylen; |
---|
| 1805 | + return (__s32)copylen; |
---|
| 1806 | +} |
---|
| 1807 | + |
---|
| 1808 | +static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, |
---|
| 1809 | + struct sock *sk, |
---|
| 1810 | + struct sk_buff *skb, |
---|
| 1811 | + u32 *seq, |
---|
| 1812 | + s32 copybuf_len) |
---|
| 1813 | +{ |
---|
| 1814 | + u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); |
---|
| 1815 | + |
---|
| 1816 | + if (!copylen) |
---|
| 1817 | + return 0; |
---|
| 1818 | + /* skb is null if inq < PAGE_SIZE. */ |
---|
| 1819 | + if (skb) |
---|
| 1820 | + offset = *seq - TCP_SKB_CB(skb)->seq; |
---|
| 1821 | + else |
---|
| 1822 | + skb = tcp_recv_skb(sk, *seq, &offset); |
---|
| 1823 | + |
---|
| 1824 | + zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, |
---|
| 1825 | + seq); |
---|
| 1826 | + return zc->copybuf_len < 0 ? 0 : copylen; |
---|
| 1827 | +} |
---|
| 1828 | + |
---|
| 1829 | +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, |
---|
| 1830 | + struct page **pages, |
---|
| 1831 | + unsigned long pages_to_map, |
---|
| 1832 | + unsigned long *insert_addr, |
---|
| 1833 | + u32 *length_with_pending, |
---|
| 1834 | + u32 *seq, |
---|
| 1835 | + struct tcp_zerocopy_receive *zc) |
---|
| 1836 | +{ |
---|
| 1837 | + unsigned long pages_remaining = pages_to_map; |
---|
| 1838 | + int bytes_mapped; |
---|
| 1839 | + int ret; |
---|
| 1840 | + |
---|
| 1841 | + ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); |
---|
| 1842 | + bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); |
---|
| 1843 | + /* Even if vm_insert_pages fails, it may have partially succeeded in |
---|
| 1844 | + * mapping (some but not all of the pages). |
---|
| 1845 | + */ |
---|
| 1846 | + *seq += bytes_mapped; |
---|
| 1847 | + *insert_addr += bytes_mapped; |
---|
| 1848 | + if (ret) { |
---|
| 1849 | + /* But if vm_insert_pages did fail, we have to unroll some state |
---|
| 1850 | + * we speculatively touched before. |
---|
| 1851 | + */ |
---|
| 1852 | + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; |
---|
| 1853 | + *length_with_pending -= bytes_not_mapped; |
---|
| 1854 | + zc->recv_skip_hint += bytes_not_mapped; |
---|
| 1855 | + } |
---|
| 1856 | + return ret; |
---|
| 1857 | +} |
---|
| 1858 | + |
---|
1766 | 1859 | static int tcp_zerocopy_receive(struct sock *sk, |
---|
1767 | 1860 | struct tcp_zerocopy_receive *zc) |
---|
1768 | 1861 | { |
---|
| 1862 | + u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0; |
---|
1769 | 1863 | unsigned long address = (unsigned long)zc->address; |
---|
| 1864 | + s32 copybuf_len = zc->copybuf_len; |
---|
| 1865 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 1866 | + #define PAGE_BATCH_SIZE 8 |
---|
| 1867 | + struct page *pages[PAGE_BATCH_SIZE]; |
---|
1770 | 1868 | const skb_frag_t *frags = NULL; |
---|
1771 | | - u32 length = 0, seq, offset; |
---|
1772 | 1869 | struct vm_area_struct *vma; |
---|
1773 | 1870 | struct sk_buff *skb = NULL; |
---|
1774 | | - struct tcp_sock *tp; |
---|
| 1871 | + unsigned long pg_idx = 0; |
---|
| 1872 | + unsigned long curr_addr; |
---|
| 1873 | + u32 seq = tp->copied_seq; |
---|
| 1874 | + int inq = tcp_inq(sk); |
---|
1775 | 1875 | int ret; |
---|
| 1876 | + |
---|
| 1877 | + zc->copybuf_len = 0; |
---|
1776 | 1878 | |
---|
1777 | 1879 | if (address & (PAGE_SIZE - 1) || address != zc->address) |
---|
1778 | 1880 | return -EINVAL; |
---|
.. | .. |
---|
1782 | 1884 | |
---|
1783 | 1885 | sock_rps_record_flow(sk); |
---|
1784 | 1886 | |
---|
1785 | | - down_read(¤t->mm->mmap_sem); |
---|
| 1887 | + mmap_read_lock(current->mm); |
---|
1786 | 1888 | |
---|
1787 | 1889 | vma = find_vma(current->mm, address); |
---|
1788 | 1890 | if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { |
---|
1789 | | - up_read(¤t->mm->mmap_sem); |
---|
| 1891 | + mmap_read_unlock(current->mm); |
---|
1790 | 1892 | return -EINVAL; |
---|
1791 | 1893 | } |
---|
1792 | | - zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); |
---|
1793 | | - |
---|
1794 | | - tp = tcp_sk(sk); |
---|
1795 | | - seq = tp->copied_seq; |
---|
1796 | | - zc->length = min_t(u32, zc->length, tcp_inq(sk)); |
---|
1797 | | - zc->length &= ~(PAGE_SIZE - 1); |
---|
1798 | | - |
---|
1799 | | - zap_page_range(vma, address, zc->length); |
---|
1800 | | - |
---|
1801 | | - zc->recv_skip_hint = 0; |
---|
| 1894 | + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); |
---|
| 1895 | + avail_len = min_t(u32, vma_len, inq); |
---|
| 1896 | + aligned_len = avail_len & ~(PAGE_SIZE - 1); |
---|
| 1897 | + if (aligned_len) { |
---|
| 1898 | + zap_page_range(vma, address, aligned_len); |
---|
| 1899 | + zc->length = aligned_len; |
---|
| 1900 | + zc->recv_skip_hint = 0; |
---|
| 1901 | + } else { |
---|
| 1902 | + zc->length = avail_len; |
---|
| 1903 | + zc->recv_skip_hint = avail_len; |
---|
| 1904 | + } |
---|
1802 | 1905 | ret = 0; |
---|
| 1906 | + curr_addr = address; |
---|
1803 | 1907 | while (length + PAGE_SIZE <= zc->length) { |
---|
1804 | 1908 | if (zc->recv_skip_hint < PAGE_SIZE) { |
---|
| 1909 | + u32 offset_frag; |
---|
| 1910 | + |
---|
| 1911 | + /* If we're here, finish the current batch. */ |
---|
| 1912 | + if (pg_idx) { |
---|
| 1913 | + ret = tcp_zerocopy_vm_insert_batch(vma, pages, |
---|
| 1914 | + pg_idx, |
---|
| 1915 | + &curr_addr, |
---|
| 1916 | + &length, |
---|
| 1917 | + &seq, zc); |
---|
| 1918 | + if (ret) |
---|
| 1919 | + goto out; |
---|
| 1920 | + pg_idx = 0; |
---|
| 1921 | + } |
---|
1805 | 1922 | if (skb) { |
---|
| 1923 | + if (zc->recv_skip_hint > 0) |
---|
| 1924 | + break; |
---|
1806 | 1925 | skb = skb->next; |
---|
1807 | 1926 | offset = seq - TCP_SKB_CB(skb)->seq; |
---|
1808 | 1927 | } else { |
---|
1809 | 1928 | skb = tcp_recv_skb(sk, seq, &offset); |
---|
1810 | 1929 | } |
---|
1811 | | - |
---|
1812 | 1930 | zc->recv_skip_hint = skb->len - offset; |
---|
1813 | | - offset -= skb_headlen(skb); |
---|
1814 | | - if ((int)offset < 0 || skb_has_frag_list(skb)) |
---|
| 1931 | + frags = skb_advance_to_frag(skb, offset, &offset_frag); |
---|
| 1932 | + if (!frags || offset_frag) |
---|
1815 | 1933 | break; |
---|
1816 | | - frags = skb_shinfo(skb)->frags; |
---|
1817 | | - while (offset) { |
---|
1818 | | - if (frags->size > offset) |
---|
1819 | | - goto out; |
---|
1820 | | - offset -= frags->size; |
---|
| 1934 | + } |
---|
| 1935 | + if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { |
---|
| 1936 | + int remaining = zc->recv_skip_hint; |
---|
| 1937 | + |
---|
| 1938 | + while (remaining && (skb_frag_size(frags) != PAGE_SIZE || |
---|
| 1939 | + skb_frag_off(frags))) { |
---|
| 1940 | + remaining -= skb_frag_size(frags); |
---|
1821 | 1941 | frags++; |
---|
1822 | 1942 | } |
---|
| 1943 | + zc->recv_skip_hint -= remaining; |
---|
| 1944 | + break; |
---|
1823 | 1945 | } |
---|
1824 | | - if (frags->size != PAGE_SIZE || frags->page_offset) |
---|
1825 | | - break; |
---|
1826 | | - ret = vm_insert_page(vma, address + length, |
---|
1827 | | - skb_frag_page(frags)); |
---|
1828 | | - if (ret) |
---|
1829 | | - break; |
---|
| 1946 | + pages[pg_idx] = skb_frag_page(frags); |
---|
| 1947 | + pg_idx++; |
---|
1830 | 1948 | length += PAGE_SIZE; |
---|
1831 | | - seq += PAGE_SIZE; |
---|
1832 | 1949 | zc->recv_skip_hint -= PAGE_SIZE; |
---|
1833 | 1950 | frags++; |
---|
| 1951 | + if (pg_idx == PAGE_BATCH_SIZE) { |
---|
| 1952 | + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, |
---|
| 1953 | + &curr_addr, &length, |
---|
| 1954 | + &seq, zc); |
---|
| 1955 | + if (ret) |
---|
| 1956 | + goto out; |
---|
| 1957 | + pg_idx = 0; |
---|
| 1958 | + } |
---|
| 1959 | + } |
---|
| 1960 | + if (pg_idx) { |
---|
| 1961 | + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, |
---|
| 1962 | + &curr_addr, &length, &seq, |
---|
| 1963 | + zc); |
---|
1834 | 1964 | } |
---|
1835 | 1965 | out: |
---|
1836 | | - up_read(¤t->mm->mmap_sem); |
---|
1837 | | - if (length) { |
---|
| 1966 | + mmap_read_unlock(current->mm); |
---|
| 1967 | + /* Try to copy straggler data. */ |
---|
| 1968 | + if (!ret) |
---|
| 1969 | + copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, |
---|
| 1970 | + copybuf_len); |
---|
| 1971 | + |
---|
| 1972 | + if (length + copylen) { |
---|
1838 | 1973 | WRITE_ONCE(tp->copied_seq, seq); |
---|
1839 | 1974 | tcp_rcv_space_adjust(sk); |
---|
1840 | 1975 | |
---|
1841 | 1976 | /* Clean up data we have read: This will do ACK frames. */ |
---|
1842 | 1977 | tcp_recv_skb(sk, seq, &offset); |
---|
1843 | | - tcp_cleanup_rbuf(sk, length); |
---|
| 1978 | + tcp_cleanup_rbuf(sk, length + copylen); |
---|
1844 | 1979 | ret = 0; |
---|
1845 | 1980 | if (length == zc->length) |
---|
1846 | 1981 | zc->recv_skip_hint = 0; |
---|
.. | .. |
---|
1854 | 1989 | #endif |
---|
1855 | 1990 | |
---|
1856 | 1991 | static void tcp_update_recv_tstamps(struct sk_buff *skb, |
---|
1857 | | - struct scm_timestamping *tss) |
---|
| 1992 | + struct scm_timestamping_internal *tss) |
---|
1858 | 1993 | { |
---|
1859 | 1994 | if (skb->tstamp) |
---|
1860 | | - tss->ts[0] = ktime_to_timespec(skb->tstamp); |
---|
| 1995 | + tss->ts[0] = ktime_to_timespec64(skb->tstamp); |
---|
1861 | 1996 | else |
---|
1862 | | - tss->ts[0] = (struct timespec) {0}; |
---|
| 1997 | + tss->ts[0] = (struct timespec64) {0}; |
---|
1863 | 1998 | |
---|
1864 | 1999 | if (skb_hwtstamps(skb)->hwtstamp) |
---|
1865 | | - tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); |
---|
| 2000 | + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); |
---|
1866 | 2001 | else |
---|
1867 | | - tss->ts[2] = (struct timespec) {0}; |
---|
| 2002 | + tss->ts[2] = (struct timespec64) {0}; |
---|
1868 | 2003 | } |
---|
1869 | 2004 | |
---|
1870 | 2005 | /* Similar to __sock_recv_timestamp, but does not require an skb */ |
---|
1871 | 2006 | static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, |
---|
1872 | | - struct scm_timestamping *tss) |
---|
| 2007 | + struct scm_timestamping_internal *tss) |
---|
1873 | 2008 | { |
---|
1874 | | - struct timeval tv; |
---|
| 2009 | + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); |
---|
1875 | 2010 | bool has_timestamping = false; |
---|
1876 | 2011 | |
---|
1877 | 2012 | if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { |
---|
1878 | 2013 | if (sock_flag(sk, SOCK_RCVTSTAMP)) { |
---|
1879 | 2014 | if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { |
---|
1880 | | - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, |
---|
1881 | | - sizeof(tss->ts[0]), &tss->ts[0]); |
---|
| 2015 | + if (new_tstamp) { |
---|
| 2016 | + struct __kernel_timespec kts = { |
---|
| 2017 | + .tv_sec = tss->ts[0].tv_sec, |
---|
| 2018 | + .tv_nsec = tss->ts[0].tv_nsec, |
---|
| 2019 | + }; |
---|
| 2020 | + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, |
---|
| 2021 | + sizeof(kts), &kts); |
---|
| 2022 | + } else { |
---|
| 2023 | + struct __kernel_old_timespec ts_old = { |
---|
| 2024 | + .tv_sec = tss->ts[0].tv_sec, |
---|
| 2025 | + .tv_nsec = tss->ts[0].tv_nsec, |
---|
| 2026 | + }; |
---|
| 2027 | + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, |
---|
| 2028 | + sizeof(ts_old), &ts_old); |
---|
| 2029 | + } |
---|
1882 | 2030 | } else { |
---|
1883 | | - tv.tv_sec = tss->ts[0].tv_sec; |
---|
1884 | | - tv.tv_usec = tss->ts[0].tv_nsec / 1000; |
---|
1885 | | - |
---|
1886 | | - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, |
---|
1887 | | - sizeof(tv), &tv); |
---|
| 2031 | + if (new_tstamp) { |
---|
| 2032 | + struct __kernel_sock_timeval stv = { |
---|
| 2033 | + .tv_sec = tss->ts[0].tv_sec, |
---|
| 2034 | + .tv_usec = tss->ts[0].tv_nsec / 1000, |
---|
| 2035 | + }; |
---|
| 2036 | + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, |
---|
| 2037 | + sizeof(stv), &stv); |
---|
| 2038 | + } else { |
---|
| 2039 | + struct __kernel_old_timeval tv = { |
---|
| 2040 | + .tv_sec = tss->ts[0].tv_sec, |
---|
| 2041 | + .tv_usec = tss->ts[0].tv_nsec / 1000, |
---|
| 2042 | + }; |
---|
| 2043 | + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, |
---|
| 2044 | + sizeof(tv), &tv); |
---|
| 2045 | + } |
---|
1888 | 2046 | } |
---|
1889 | 2047 | } |
---|
1890 | 2048 | |
---|
1891 | 2049 | if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) |
---|
1892 | 2050 | has_timestamping = true; |
---|
1893 | 2051 | else |
---|
1894 | | - tss->ts[0] = (struct timespec) {0}; |
---|
| 2052 | + tss->ts[0] = (struct timespec64) {0}; |
---|
1895 | 2053 | } |
---|
1896 | 2054 | |
---|
1897 | 2055 | if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { |
---|
1898 | 2056 | if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) |
---|
1899 | 2057 | has_timestamping = true; |
---|
1900 | 2058 | else |
---|
1901 | | - tss->ts[2] = (struct timespec) {0}; |
---|
| 2059 | + tss->ts[2] = (struct timespec64) {0}; |
---|
1902 | 2060 | } |
---|
1903 | 2061 | |
---|
1904 | 2062 | if (has_timestamping) { |
---|
1905 | | - tss->ts[1] = (struct timespec) {0}; |
---|
1906 | | - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, |
---|
1907 | | - sizeof(*tss), tss); |
---|
| 2063 | + tss->ts[1] = (struct timespec64) {0}; |
---|
| 2064 | + if (sock_flag(sk, SOCK_TSTAMP_NEW)) |
---|
| 2065 | + put_cmsg_scm_timestamping64(msg, tss); |
---|
| 2066 | + else |
---|
| 2067 | + put_cmsg_scm_timestamping(msg, tss); |
---|
1908 | 2068 | } |
---|
1909 | 2069 | } |
---|
1910 | 2070 | |
---|
.. | .. |
---|
1950 | 2110 | long timeo; |
---|
1951 | 2111 | struct sk_buff *skb, *last; |
---|
1952 | 2112 | u32 urg_hole = 0; |
---|
1953 | | - struct scm_timestamping tss; |
---|
1954 | | - bool has_tss = false; |
---|
1955 | | - bool has_cmsg; |
---|
| 2113 | + struct scm_timestamping_internal tss; |
---|
| 2114 | + int cmsg_flags; |
---|
1956 | 2115 | |
---|
1957 | 2116 | if (unlikely(flags & MSG_ERRQUEUE)) |
---|
1958 | 2117 | return inet_recv_error(sk, msg, len, addr_len); |
---|
| 2118 | + trace_android_rvh_tcp_recvmsg(sk); |
---|
1959 | 2119 | |
---|
1960 | 2120 | if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && |
---|
1961 | 2121 | (sk->sk_state == TCP_ESTABLISHED)) |
---|
.. | .. |
---|
1967 | 2127 | if (sk->sk_state == TCP_LISTEN) |
---|
1968 | 2128 | goto out; |
---|
1969 | 2129 | |
---|
1970 | | - has_cmsg = tp->recvmsg_inq; |
---|
| 2130 | + cmsg_flags = tp->recvmsg_inq ? 1 : 0; |
---|
1971 | 2131 | timeo = sock_rcvtimeo(sk, nonblock); |
---|
1972 | 2132 | |
---|
1973 | 2133 | /* Urgent data needs to be handled specially. */ |
---|
.. | .. |
---|
2100 | 2260 | } |
---|
2101 | 2261 | continue; |
---|
2102 | 2262 | |
---|
2103 | | - found_ok_skb: |
---|
| 2263 | +found_ok_skb: |
---|
2104 | 2264 | /* Ok so how much can we use? */ |
---|
2105 | 2265 | used = skb->len - offset; |
---|
2106 | 2266 | if (len < used) |
---|
.. | .. |
---|
2148 | 2308 | |
---|
2149 | 2309 | if (TCP_SKB_CB(skb)->has_rxtstamp) { |
---|
2150 | 2310 | tcp_update_recv_tstamps(skb, &tss); |
---|
2151 | | - has_tss = true; |
---|
2152 | | - has_cmsg = true; |
---|
| 2311 | + cmsg_flags |= 2; |
---|
2153 | 2312 | } |
---|
2154 | 2313 | |
---|
2155 | 2314 | if (used + offset < skb->len) |
---|
.. | .. |
---|
2161 | 2320 | sk_eat_skb(sk, skb); |
---|
2162 | 2321 | continue; |
---|
2163 | 2322 | |
---|
2164 | | - found_fin_ok: |
---|
| 2323 | +found_fin_ok: |
---|
2165 | 2324 | /* Process the FIN. */ |
---|
2166 | 2325 | WRITE_ONCE(*seq, *seq + 1); |
---|
2167 | 2326 | if (!(flags & MSG_PEEK)) |
---|
.. | .. |
---|
2169 | 2328 | break; |
---|
2170 | 2329 | } while (len > 0); |
---|
2171 | 2330 | |
---|
| 2331 | + trace_android_rvh_tcp_recvmsg_stat(sk, copied); |
---|
2172 | 2332 | /* According to UNIX98, msg_name/msg_namelen are ignored |
---|
2173 | 2333 | * on connected socket. I was just happy when found this 8) --ANK |
---|
2174 | 2334 | */ |
---|
.. | .. |
---|
2178 | 2338 | |
---|
2179 | 2339 | release_sock(sk); |
---|
2180 | 2340 | |
---|
2181 | | - if (has_cmsg) { |
---|
2182 | | - if (has_tss) |
---|
| 2341 | + if (cmsg_flags) { |
---|
| 2342 | + if (cmsg_flags & 2) |
---|
2183 | 2343 | tcp_recv_timestamp(msg, sk, &tss); |
---|
2184 | | - if (tp->recvmsg_inq) { |
---|
| 2344 | + if (cmsg_flags & 1) { |
---|
2185 | 2345 | inq = tcp_inq_hint(sk); |
---|
2186 | 2346 | put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); |
---|
2187 | 2347 | } |
---|
.. | .. |
---|
2245 | 2405 | if (inet_csk(sk)->icsk_bind_hash && |
---|
2246 | 2406 | !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) |
---|
2247 | 2407 | inet_put_port(sk); |
---|
2248 | | - /* fall through */ |
---|
| 2408 | + fallthrough; |
---|
2249 | 2409 | default: |
---|
2250 | 2410 | if (oldstate == TCP_ESTABLISHED) |
---|
2251 | 2411 | TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); |
---|
.. | .. |
---|
2255 | 2415 | * socket sitting in hash tables. |
---|
2256 | 2416 | */ |
---|
2257 | 2417 | inet_sk_state_store(sk, state); |
---|
2258 | | - |
---|
2259 | | -#ifdef STATE_TRACE |
---|
2260 | | - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); |
---|
2261 | | -#endif |
---|
2262 | 2418 | } |
---|
2263 | 2419 | EXPORT_SYMBOL_GPL(tcp_set_state); |
---|
2264 | 2420 | |
---|
.. | .. |
---|
2335 | 2491 | return too_many_orphans || out_of_socket_memory; |
---|
2336 | 2492 | } |
---|
2337 | 2493 | |
---|
2338 | | -void tcp_close(struct sock *sk, long timeout) |
---|
| 2494 | +void __tcp_close(struct sock *sk, long timeout) |
---|
2339 | 2495 | { |
---|
2340 | 2496 | struct sk_buff *skb; |
---|
2341 | 2497 | int data_was_unread = 0; |
---|
2342 | 2498 | int state; |
---|
2343 | 2499 | |
---|
2344 | | - lock_sock(sk); |
---|
2345 | | - sk->sk_shutdown = SHUTDOWN_MASK; |
---|
| 2500 | + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); |
---|
2346 | 2501 | |
---|
2347 | 2502 | if (sk->sk_state == TCP_LISTEN) { |
---|
2348 | 2503 | tcp_set_state(sk, TCP_CLOSE); |
---|
.. | .. |
---|
2488 | 2643 | } |
---|
2489 | 2644 | |
---|
2490 | 2645 | if (sk->sk_state == TCP_CLOSE) { |
---|
2491 | | - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; |
---|
| 2646 | + struct request_sock *req; |
---|
| 2647 | + |
---|
| 2648 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
---|
| 2649 | + lockdep_sock_is_held(sk)); |
---|
2492 | 2650 | /* We could get here with a non-NULL req if the socket is |
---|
2493 | 2651 | * aborted (e.g., closed with unread data) before 3WHS |
---|
2494 | 2652 | * finishes. |
---|
.. | .. |
---|
2502 | 2660 | out: |
---|
2503 | 2661 | bh_unlock_sock(sk); |
---|
2504 | 2662 | local_bh_enable(); |
---|
| 2663 | +} |
---|
| 2664 | + |
---|
| 2665 | +void tcp_close(struct sock *sk, long timeout) |
---|
| 2666 | +{ |
---|
| 2667 | + lock_sock(sk); |
---|
| 2668 | + __tcp_close(sk, timeout); |
---|
2505 | 2669 | release_sock(sk); |
---|
2506 | 2670 | sock_put(sk); |
---|
2507 | 2671 | } |
---|
.. | .. |
---|
2543 | 2707 | sk_wmem_free_skb(sk, skb); |
---|
2544 | 2708 | } |
---|
2545 | 2709 | tcp_rtx_queue_purge(sk); |
---|
| 2710 | + skb = sk->sk_tx_skb_cache; |
---|
| 2711 | + if (skb) { |
---|
| 2712 | + __kfree_skb(skb); |
---|
| 2713 | + sk->sk_tx_skb_cache = NULL; |
---|
| 2714 | + } |
---|
2546 | 2715 | INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); |
---|
2547 | 2716 | sk_mem_reclaim(sk); |
---|
2548 | 2717 | tcp_clear_all_retrans_hints(tcp_sk(sk)); |
---|
.. | .. |
---|
2579 | 2748 | |
---|
2580 | 2749 | tcp_clear_xmit_timers(sk); |
---|
2581 | 2750 | __skb_queue_purge(&sk->sk_receive_queue); |
---|
| 2751 | + if (sk->sk_rx_skb_cache) { |
---|
| 2752 | + __kfree_skb(sk->sk_rx_skb_cache); |
---|
| 2753 | + sk->sk_rx_skb_cache = NULL; |
---|
| 2754 | + } |
---|
2582 | 2755 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
---|
2583 | 2756 | tp->urg_data = 0; |
---|
2584 | 2757 | tcp_write_queue_purge(sk); |
---|
.. | .. |
---|
2590 | 2763 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) |
---|
2591 | 2764 | inet_reset_saddr(sk); |
---|
2592 | 2765 | |
---|
2593 | | - sk->sk_shutdown = 0; |
---|
| 2766 | + WRITE_ONCE(sk->sk_shutdown, 0); |
---|
2594 | 2767 | sock_reset_flag(sk, SOCK_DONE); |
---|
2595 | 2768 | tp->srtt_us = 0; |
---|
| 2769 | + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
---|
2596 | 2770 | tp->rcv_rtt_last_tsecr = 0; |
---|
2597 | 2771 | |
---|
2598 | 2772 | seq = tp->write_seq + tp->max_window + 2; |
---|
.. | .. |
---|
2600 | 2774 | seq = 1; |
---|
2601 | 2775 | WRITE_ONCE(tp->write_seq, seq); |
---|
2602 | 2776 | |
---|
2603 | | - tp->snd_cwnd = 2; |
---|
| 2777 | + icsk->icsk_backoff = 0; |
---|
2604 | 2778 | icsk->icsk_probes_out = 0; |
---|
| 2779 | + icsk->icsk_probes_tstamp = 0; |
---|
| 2780 | + icsk->icsk_rto = TCP_TIMEOUT_INIT; |
---|
| 2781 | + icsk->icsk_rto_min = TCP_RTO_MIN; |
---|
| 2782 | + icsk->icsk_delack_max = TCP_DELACK_MAX; |
---|
2605 | 2783 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
---|
| 2784 | + tp->snd_cwnd = TCP_INIT_CWND; |
---|
2606 | 2785 | tp->snd_cwnd_cnt = 0; |
---|
| 2786 | + tp->is_cwnd_limited = 0; |
---|
| 2787 | + tp->max_packets_out = 0; |
---|
2607 | 2788 | tp->window_clamp = 0; |
---|
2608 | 2789 | tp->delivered = 0; |
---|
2609 | 2790 | tp->delivered_ce = 0; |
---|
2610 | 2791 | if (icsk->icsk_ca_ops->release) |
---|
2611 | 2792 | icsk->icsk_ca_ops->release(sk); |
---|
2612 | 2793 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); |
---|
| 2794 | + icsk->icsk_ca_initialized = 0; |
---|
2613 | 2795 | tcp_set_ca_state(sk, TCP_CA_Open); |
---|
2614 | 2796 | tp->is_sack_reneg = 0; |
---|
2615 | 2797 | tcp_clear_retrans(tp); |
---|
.. | .. |
---|
2621 | 2803 | icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; |
---|
2622 | 2804 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); |
---|
2623 | 2805 | __sk_dst_reset(sk); |
---|
2624 | | - dst_release(sk->sk_rx_dst); |
---|
2625 | | - sk->sk_rx_dst = NULL; |
---|
| 2806 | + dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); |
---|
2626 | 2807 | tcp_saved_syn_free(tp); |
---|
2627 | 2808 | tp->compressed_ack = 0; |
---|
2628 | 2809 | tp->segs_in = 0; |
---|
.. | .. |
---|
2633 | 2814 | tp->bytes_retrans = 0; |
---|
2634 | 2815 | tp->data_segs_in = 0; |
---|
2635 | 2816 | tp->data_segs_out = 0; |
---|
| 2817 | + tp->duplicate_sack[0].start_seq = 0; |
---|
| 2818 | + tp->duplicate_sack[0].end_seq = 0; |
---|
2636 | 2819 | tp->dsack_dups = 0; |
---|
2637 | 2820 | tp->reord_seen = 0; |
---|
| 2821 | + tp->retrans_out = 0; |
---|
| 2822 | + tp->sacked_out = 0; |
---|
| 2823 | + tp->tlp_high_seq = 0; |
---|
| 2824 | + tp->last_oow_ack_time = 0; |
---|
| 2825 | + /* There's a bubble in the pipe until at least the first ACK. */ |
---|
| 2826 | + tp->app_limited = ~0U; |
---|
| 2827 | + tp->rate_app_limited = 1; |
---|
| 2828 | + tp->rack.mstamp = 0; |
---|
| 2829 | + tp->rack.advanced = 0; |
---|
| 2830 | + tp->rack.reo_wnd_steps = 1; |
---|
| 2831 | + tp->rack.last_delivered = 0; |
---|
| 2832 | + tp->rack.reo_wnd_persist = 0; |
---|
| 2833 | + tp->rack.dsack_seen = 0; |
---|
| 2834 | + tp->syn_data_acked = 0; |
---|
| 2835 | + tp->rx_opt.saw_tstamp = 0; |
---|
| 2836 | + tp->rx_opt.dsack = 0; |
---|
| 2837 | + tp->rx_opt.num_sacks = 0; |
---|
| 2838 | + tp->rcv_ooopack = 0; |
---|
| 2839 | + |
---|
2638 | 2840 | |
---|
2639 | 2841 | /* Clean up fastopen related fields */ |
---|
2640 | 2842 | tcp_free_fastopen_req(tp); |
---|
2641 | 2843 | inet->defer_connect = 0; |
---|
| 2844 | + tp->fastopen_client_fail = 0; |
---|
2642 | 2845 | |
---|
2643 | 2846 | WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); |
---|
2644 | 2847 | |
---|
.. | .. |
---|
2659 | 2862 | (sk->sk_state != TCP_LISTEN); |
---|
2660 | 2863 | } |
---|
2661 | 2864 | |
---|
2662 | | -static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) |
---|
| 2865 | +static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) |
---|
2663 | 2866 | { |
---|
2664 | 2867 | struct tcp_repair_window opt; |
---|
2665 | 2868 | |
---|
.. | .. |
---|
2669 | 2872 | if (len != sizeof(opt)) |
---|
2670 | 2873 | return -EINVAL; |
---|
2671 | 2874 | |
---|
2672 | | - if (copy_from_user(&opt, optbuf, sizeof(opt))) |
---|
| 2875 | + if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) |
---|
2673 | 2876 | return -EFAULT; |
---|
2674 | 2877 | |
---|
2675 | 2878 | if (opt.max_window < opt.snd_wnd) |
---|
.. | .. |
---|
2691 | 2894 | return 0; |
---|
2692 | 2895 | } |
---|
2693 | 2896 | |
---|
2694 | | -static int tcp_repair_options_est(struct sock *sk, |
---|
2695 | | - struct tcp_repair_opt __user *optbuf, unsigned int len) |
---|
| 2897 | +static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, |
---|
| 2898 | + unsigned int len) |
---|
2696 | 2899 | { |
---|
2697 | 2900 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2698 | 2901 | struct tcp_repair_opt opt; |
---|
| 2902 | + size_t offset = 0; |
---|
2699 | 2903 | |
---|
2700 | 2904 | while (len >= sizeof(opt)) { |
---|
2701 | | - if (copy_from_user(&opt, optbuf, sizeof(opt))) |
---|
| 2905 | + if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) |
---|
2702 | 2906 | return -EFAULT; |
---|
2703 | 2907 | |
---|
2704 | | - optbuf++; |
---|
| 2908 | + offset += sizeof(opt); |
---|
2705 | 2909 | len -= sizeof(opt); |
---|
2706 | 2910 | |
---|
2707 | 2911 | switch (opt.opt_code) { |
---|
.. | .. |
---|
2740 | 2944 | return 0; |
---|
2741 | 2945 | } |
---|
2742 | 2946 | |
---|
| 2947 | +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); |
---|
| 2948 | +EXPORT_SYMBOL(tcp_tx_delay_enabled); |
---|
| 2949 | + |
---|
| 2950 | +static void tcp_enable_tx_delay(void) |
---|
| 2951 | +{ |
---|
| 2952 | + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { |
---|
| 2953 | + static int __tcp_tx_delay_enabled = 0; |
---|
| 2954 | + |
---|
| 2955 | + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { |
---|
| 2956 | + static_branch_enable(&tcp_tx_delay_enabled); |
---|
| 2957 | + pr_info("TCP_TX_DELAY enabled\n"); |
---|
| 2958 | + } |
---|
| 2959 | + } |
---|
| 2960 | +} |
---|
| 2961 | + |
---|
| 2962 | +/* When set indicates to always queue non-full frames. Later the user clears |
---|
| 2963 | + * this option and we transmit any pending partial frames in the queue. This is |
---|
| 2964 | + * meant to be used alongside sendfile() to get properly filled frames when the |
---|
| 2965 | + * user (for example) must write out headers with a write() call first and then |
---|
| 2966 | + * use sendfile to send out the data parts. |
---|
| 2967 | + * |
---|
| 2968 | + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than |
---|
| 2969 | + * TCP_NODELAY. |
---|
| 2970 | + */ |
---|
| 2971 | +static void __tcp_sock_set_cork(struct sock *sk, bool on) |
---|
| 2972 | +{ |
---|
| 2973 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2974 | + |
---|
| 2975 | + if (on) { |
---|
| 2976 | + tp->nonagle |= TCP_NAGLE_CORK; |
---|
| 2977 | + } else { |
---|
| 2978 | + tp->nonagle &= ~TCP_NAGLE_CORK; |
---|
| 2979 | + if (tp->nonagle & TCP_NAGLE_OFF) |
---|
| 2980 | + tp->nonagle |= TCP_NAGLE_PUSH; |
---|
| 2981 | + tcp_push_pending_frames(sk); |
---|
| 2982 | + } |
---|
| 2983 | +} |
---|
| 2984 | + |
---|
| 2985 | +void tcp_sock_set_cork(struct sock *sk, bool on) |
---|
| 2986 | +{ |
---|
| 2987 | + lock_sock(sk); |
---|
| 2988 | + __tcp_sock_set_cork(sk, on); |
---|
| 2989 | + release_sock(sk); |
---|
| 2990 | +} |
---|
| 2991 | +EXPORT_SYMBOL(tcp_sock_set_cork); |
---|
| 2992 | + |
---|
| 2993 | +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is |
---|
| 2994 | + * remembered, but it is not activated until cork is cleared. |
---|
| 2995 | + * |
---|
| 2996 | + * However, when TCP_NODELAY is set we make an explicit push, which overrides |
---|
| 2997 | + * even TCP_CORK for currently queued segments. |
---|
| 2998 | + */ |
---|
| 2999 | +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) |
---|
| 3000 | +{ |
---|
| 3001 | + if (on) { |
---|
| 3002 | + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; |
---|
| 3003 | + tcp_push_pending_frames(sk); |
---|
| 3004 | + } else { |
---|
| 3005 | + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; |
---|
| 3006 | + } |
---|
| 3007 | +} |
---|
| 3008 | + |
---|
| 3009 | +void tcp_sock_set_nodelay(struct sock *sk) |
---|
| 3010 | +{ |
---|
| 3011 | + lock_sock(sk); |
---|
| 3012 | + __tcp_sock_set_nodelay(sk, true); |
---|
| 3013 | + release_sock(sk); |
---|
| 3014 | +} |
---|
| 3015 | +EXPORT_SYMBOL(tcp_sock_set_nodelay); |
---|
| 3016 | + |
---|
| 3017 | +static void __tcp_sock_set_quickack(struct sock *sk, int val) |
---|
| 3018 | +{ |
---|
| 3019 | + if (!val) { |
---|
| 3020 | + inet_csk_enter_pingpong_mode(sk); |
---|
| 3021 | + return; |
---|
| 3022 | + } |
---|
| 3023 | + |
---|
| 3024 | + inet_csk_exit_pingpong_mode(sk); |
---|
| 3025 | + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
---|
| 3026 | + inet_csk_ack_scheduled(sk)) { |
---|
| 3027 | + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; |
---|
| 3028 | + tcp_cleanup_rbuf(sk, 1); |
---|
| 3029 | + if (!(val & 1)) |
---|
| 3030 | + inet_csk_enter_pingpong_mode(sk); |
---|
| 3031 | + } |
---|
| 3032 | +} |
---|
| 3033 | + |
---|
| 3034 | +void tcp_sock_set_quickack(struct sock *sk, int val) |
---|
| 3035 | +{ |
---|
| 3036 | + lock_sock(sk); |
---|
| 3037 | + __tcp_sock_set_quickack(sk, val); |
---|
| 3038 | + release_sock(sk); |
---|
| 3039 | +} |
---|
| 3040 | +EXPORT_SYMBOL(tcp_sock_set_quickack); |
---|
| 3041 | + |
---|
| 3042 | +int tcp_sock_set_syncnt(struct sock *sk, int val) |
---|
| 3043 | +{ |
---|
| 3044 | + if (val < 1 || val > MAX_TCP_SYNCNT) |
---|
| 3045 | + return -EINVAL; |
---|
| 3046 | + |
---|
| 3047 | + lock_sock(sk); |
---|
| 3048 | + inet_csk(sk)->icsk_syn_retries = val; |
---|
| 3049 | + release_sock(sk); |
---|
| 3050 | + return 0; |
---|
| 3051 | +} |
---|
| 3052 | +EXPORT_SYMBOL(tcp_sock_set_syncnt); |
---|
| 3053 | + |
---|
| 3054 | +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) |
---|
| 3055 | +{ |
---|
| 3056 | + lock_sock(sk); |
---|
| 3057 | + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); |
---|
| 3058 | + release_sock(sk); |
---|
| 3059 | +} |
---|
| 3060 | +EXPORT_SYMBOL(tcp_sock_set_user_timeout); |
---|
| 3061 | + |
---|
| 3062 | +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) |
---|
| 3063 | +{ |
---|
| 3064 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 3065 | + |
---|
| 3066 | + if (val < 1 || val > MAX_TCP_KEEPIDLE) |
---|
| 3067 | + return -EINVAL; |
---|
| 3068 | + |
---|
| 3069 | + /* Paired with WRITE_ONCE() in keepalive_time_when() */ |
---|
| 3070 | + WRITE_ONCE(tp->keepalive_time, val * HZ); |
---|
| 3071 | + if (sock_flag(sk, SOCK_KEEPOPEN) && |
---|
| 3072 | + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { |
---|
| 3073 | + u32 elapsed = keepalive_time_elapsed(tp); |
---|
| 3074 | + |
---|
| 3075 | + if (tp->keepalive_time > elapsed) |
---|
| 3076 | + elapsed = tp->keepalive_time - elapsed; |
---|
| 3077 | + else |
---|
| 3078 | + elapsed = 0; |
---|
| 3079 | + inet_csk_reset_keepalive_timer(sk, elapsed); |
---|
| 3080 | + } |
---|
| 3081 | + |
---|
| 3082 | + return 0; |
---|
| 3083 | +} |
---|
| 3084 | + |
---|
| 3085 | +int tcp_sock_set_keepidle(struct sock *sk, int val) |
---|
| 3086 | +{ |
---|
| 3087 | + int err; |
---|
| 3088 | + |
---|
| 3089 | + lock_sock(sk); |
---|
| 3090 | + err = tcp_sock_set_keepidle_locked(sk, val); |
---|
| 3091 | + release_sock(sk); |
---|
| 3092 | + return err; |
---|
| 3093 | +} |
---|
| 3094 | +EXPORT_SYMBOL(tcp_sock_set_keepidle); |
---|
| 3095 | + |
---|
| 3096 | +int tcp_sock_set_keepintvl(struct sock *sk, int val) |
---|
| 3097 | +{ |
---|
| 3098 | + if (val < 1 || val > MAX_TCP_KEEPINTVL) |
---|
| 3099 | + return -EINVAL; |
---|
| 3100 | + |
---|
| 3101 | + lock_sock(sk); |
---|
| 3102 | + WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); |
---|
| 3103 | + release_sock(sk); |
---|
| 3104 | + return 0; |
---|
| 3105 | +} |
---|
| 3106 | +EXPORT_SYMBOL(tcp_sock_set_keepintvl); |
---|
| 3107 | + |
---|
| 3108 | +int tcp_sock_set_keepcnt(struct sock *sk, int val) |
---|
| 3109 | +{ |
---|
| 3110 | + if (val < 1 || val > MAX_TCP_KEEPCNT) |
---|
| 3111 | + return -EINVAL; |
---|
| 3112 | + |
---|
| 3113 | + lock_sock(sk); |
---|
| 3114 | + /* Paired with READ_ONCE() in keepalive_probes() */ |
---|
| 3115 | + WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); |
---|
| 3116 | + release_sock(sk); |
---|
| 3117 | + return 0; |
---|
| 3118 | +} |
---|
| 3119 | +EXPORT_SYMBOL(tcp_sock_set_keepcnt); |
---|
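
Together with sock_set_keepalive() (the companion helper assumed to be added to net/core/sock.c in this series), the three keepalive setters let a kernel consumer express a complete probing policy without setsockopt(). A sketch under that assumption:

```c
#include <net/sock.h>
#include <net/tcp.h>

/* Hypothetical policy: first probe after 60s idle, then every 10s,
 * declare the peer dead after 5 unanswered probes.
 */
static int example_enable_keepalive(struct sock *sk)
{
	int err;

	sock_set_keepalive(sk);		/* assumed companion: sets SOCK_KEEPOPEN */
	err = tcp_sock_set_keepidle(sk, 60);
	if (err)
		return err;
	err = tcp_sock_set_keepintvl(sk, 10);
	if (err)
		return err;
	return tcp_sock_set_keepcnt(sk, 5);
}
```
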
| 3120 | + |
---|
2743 | 3121 | /* |
---|
2744 | 3122 | * Socket option code for TCP. |
---|
2745 | 3123 | */ |
---|
2746 | | -static int do_tcp_setsockopt(struct sock *sk, int level, |
---|
2747 | | - int optname, char __user *optval, unsigned int optlen) |
---|
| 3124 | +static int do_tcp_setsockopt(struct sock *sk, int level, int optname, |
---|
| 3125 | + sockptr_t optval, unsigned int optlen) |
---|
2748 | 3126 | { |
---|
2749 | 3127 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2750 | 3128 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
.. | .. |
---|
2760 | 3138 | if (optlen < 1) |
---|
2761 | 3139 | return -EINVAL; |
---|
2762 | 3140 | |
---|
2763 | | - val = strncpy_from_user(name, optval, |
---|
| 3141 | + val = strncpy_from_sockptr(name, optval, |
---|
2764 | 3142 | min_t(long, TCP_CA_NAME_MAX-1, optlen)); |
---|
2765 | 3143 | if (val < 0) |
---|
2766 | 3144 | return -EFAULT; |
---|
2767 | 3145 | name[val] = 0; |
---|
2768 | 3146 | |
---|
2769 | 3147 | lock_sock(sk); |
---|
2770 | | - err = tcp_set_congestion_control(sk, name, true, true, |
---|
| 3148 | + err = tcp_set_congestion_control(sk, name, true, |
---|
2771 | 3149 | ns_capable(sock_net(sk)->user_ns, |
---|
2772 | 3150 | CAP_NET_ADMIN)); |
---|
2773 | 3151 | release_sock(sk); |
---|
.. | .. |
---|
2779 | 3157 | if (optlen < 1) |
---|
2780 | 3158 | return -EINVAL; |
---|
2781 | 3159 | |
---|
2782 | | - val = strncpy_from_user(name, optval, |
---|
| 3160 | + val = strncpy_from_sockptr(name, optval, |
---|
2783 | 3161 | min_t(long, TCP_ULP_NAME_MAX - 1, |
---|
2784 | 3162 | optlen)); |
---|
2785 | 3163 | if (val < 0) |
---|
.. | .. |
---|
2792 | 3170 | return err; |
---|
2793 | 3171 | } |
---|
2794 | 3172 | case TCP_FASTOPEN_KEY: { |
---|
2795 | | - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; |
---|
| 3173 | + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; |
---|
| 3174 | + __u8 *backup_key = NULL; |
---|
2796 | 3175 | |
---|
2797 | | - if (optlen != sizeof(key)) |
---|
| 3176 | + /* Allow a backup key as well to facilitate key rotation; |
---|
| 3177 | + * the first key is the active one. |
---|
| 3178 | + */ |
---|
| 3179 | + if (optlen != TCP_FASTOPEN_KEY_LENGTH && |
---|
| 3180 | + optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) |
---|
2798 | 3181 | return -EINVAL; |
---|
2799 | 3182 | |
---|
2800 | | - if (copy_from_user(key, optval, optlen)) |
---|
| 3183 | + if (copy_from_sockptr(key, optval, optlen)) |
---|
2801 | 3184 | return -EFAULT; |
---|
2802 | 3185 | |
---|
2803 | | - return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); |
---|
| 3186 | + if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) |
---|
| 3187 | + backup_key = key + TCP_FASTOPEN_KEY_LENGTH; |
---|
| 3188 | + |
---|
| 3189 | + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); |
---|
2804 | 3190 | } |
---|
2805 | 3191 | default: |
---|
2806 | 3192 | /* fallthru */ |
---|
.. | .. |
---|
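
The double-length buffer accepted by the TCP_FASTOPEN_KEY case above makes server-side key rotation race-free: install the new primary key and keep the old one as backup so outstanding cookies still validate. A hedged userspace sketch (assumes headers that define TCP_FASTOPEN_KEY; key material is a placeholder):

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <string.h>

#define TFO_KEY_LEN 16	/* TCP_FASTOPEN_KEY_LENGTH */

/* Illustrative rotation: new key becomes primary, old key the backup. */
static int rotate_tfo_key(int listen_fd,
			  const unsigned char new_key[TFO_KEY_LEN],
			  const unsigned char old_key[TFO_KEY_LEN])
{
	unsigned char buf[2 * TFO_KEY_LEN];

	memcpy(buf, new_key, TFO_KEY_LEN);
	memcpy(buf + TFO_KEY_LEN, old_key, TFO_KEY_LEN);
	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN_KEY,
			  buf, sizeof(buf));
}
```
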
2810 | 3196 | if (optlen < sizeof(int)) |
---|
2811 | 3197 | return -EINVAL; |
---|
2812 | 3198 | |
---|
2813 | | - if (get_user(val, (int __user *)optval)) |
---|
| 3199 | + if (copy_from_sockptr(&val, optval, sizeof(val))) |
---|
2814 | 3200 | return -EFAULT; |
---|
2815 | 3201 | |
---|
2816 | 3202 | lock_sock(sk); |
---|
.. | .. |
---|
2829 | 3215 | break; |
---|
2830 | 3216 | |
---|
2831 | 3217 | case TCP_NODELAY: |
---|
2832 | | - if (val) { |
---|
2833 | | - /* TCP_NODELAY is weaker than TCP_CORK, so that |
---|
2834 | | - * this option on corked socket is remembered, but |
---|
2835 | | - * it is not activated until cork is cleared. |
---|
2836 | | - * |
---|
2837 | | - * However, when TCP_NODELAY is set we make |
---|
2838 | | - * an explicit push, which overrides even TCP_CORK |
---|
2839 | | - * for currently queued segments. |
---|
2840 | | - */ |
---|
2841 | | - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; |
---|
2842 | | - tcp_push_pending_frames(sk); |
---|
2843 | | - } else { |
---|
2844 | | - tp->nonagle &= ~TCP_NAGLE_OFF; |
---|
2845 | | - } |
---|
| 3218 | + __tcp_sock_set_nodelay(sk, val); |
---|
2846 | 3219 | break; |
---|
2847 | 3220 | |
---|
2848 | 3221 | case TCP_THIN_LINEAR_TIMEOUTS: |
---|
.. | .. |
---|
2908 | 3281 | case TCP_REPAIR_OPTIONS: |
---|
2909 | 3282 | if (!tp->repair) |
---|
2910 | 3283 | err = -EINVAL; |
---|
2911 | | - else if (sk->sk_state == TCP_ESTABLISHED) |
---|
2912 | | - err = tcp_repair_options_est(sk, |
---|
2913 | | - (struct tcp_repair_opt __user *)optval, |
---|
2914 | | - optlen); |
---|
| 3284 | + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) |
---|
| 3285 | + err = tcp_repair_options_est(sk, optval, optlen); |
---|
2915 | 3286 | else |
---|
2916 | 3287 | err = -EPERM; |
---|
2917 | 3288 | break; |
---|
2918 | 3289 | |
---|
2919 | 3290 | case TCP_CORK: |
---|
2920 | | - /* When set indicates to always queue non-full frames. |
---|
2921 | | - * Later the user clears this option and we transmit |
---|
2922 | | - * any pending partial frames in the queue. This is |
---|
2923 | | - * meant to be used alongside sendfile() to get properly |
---|
2924 | | - * filled frames when the user (for example) must write |
---|
2925 | | - * out headers with a write() call first and then use |
---|
2926 | | - * sendfile to send out the data parts. |
---|
2927 | | - * |
---|
2928 | | - * TCP_CORK can be set together with TCP_NODELAY and it is |
---|
2929 | | - * stronger than TCP_NODELAY. |
---|
2930 | | - */ |
---|
2931 | | - if (val) { |
---|
2932 | | - tp->nonagle |= TCP_NAGLE_CORK; |
---|
2933 | | - } else { |
---|
2934 | | - tp->nonagle &= ~TCP_NAGLE_CORK; |
---|
2935 | | - if (tp->nonagle&TCP_NAGLE_OFF) |
---|
2936 | | - tp->nonagle |= TCP_NAGLE_PUSH; |
---|
2937 | | - tcp_push_pending_frames(sk); |
---|
2938 | | - } |
---|
| 3291 | + __tcp_sock_set_cork(sk, val); |
---|
2939 | 3292 | break; |
---|
2940 | 3293 | |
---|
2941 | 3294 | case TCP_KEEPIDLE: |
---|
2942 | | - if (val < 1 || val > MAX_TCP_KEEPIDLE) |
---|
2943 | | - err = -EINVAL; |
---|
2944 | | - else { |
---|
2945 | | - tp->keepalive_time = val * HZ; |
---|
2946 | | - if (sock_flag(sk, SOCK_KEEPOPEN) && |
---|
2947 | | - !((1 << sk->sk_state) & |
---|
2948 | | - (TCPF_CLOSE | TCPF_LISTEN))) { |
---|
2949 | | - u32 elapsed = keepalive_time_elapsed(tp); |
---|
2950 | | - if (tp->keepalive_time > elapsed) |
---|
2951 | | - elapsed = tp->keepalive_time - elapsed; |
---|
2952 | | - else |
---|
2953 | | - elapsed = 0; |
---|
2954 | | - inet_csk_reset_keepalive_timer(sk, elapsed); |
---|
2955 | | - } |
---|
2956 | | - } |
---|
| 3295 | + err = tcp_sock_set_keepidle_locked(sk, val); |
---|
2957 | 3296 | break; |
---|
2958 | 3297 | case TCP_KEEPINTVL: |
---|
2959 | 3298 | if (val < 1 || val > MAX_TCP_KEEPINTVL) |
---|
2960 | 3299 | err = -EINVAL; |
---|
2961 | 3300 | else |
---|
2962 | | - tp->keepalive_intvl = val * HZ; |
---|
| 3301 | + WRITE_ONCE(tp->keepalive_intvl, val * HZ); |
---|
2963 | 3302 | break; |
---|
2964 | 3303 | case TCP_KEEPCNT: |
---|
2965 | 3304 | if (val < 1 || val > MAX_TCP_KEEPCNT) |
---|
2966 | 3305 | err = -EINVAL; |
---|
2967 | 3306 | else |
---|
2968 | | - tp->keepalive_probes = val; |
---|
| 3307 | + WRITE_ONCE(tp->keepalive_probes, val); |
---|
2969 | 3308 | break; |
---|
2970 | 3309 | case TCP_SYNCNT: |
---|
2971 | 3310 | if (val < 1 || val > MAX_TCP_SYNCNT) |
---|
.. | .. |
---|
2975 | 3314 | break; |
---|
2976 | 3315 | |
---|
2977 | 3316 | case TCP_SAVE_SYN: |
---|
2978 | | - if (val < 0 || val > 1) |
---|
| 3317 | + /* 0: disable, 1: enable, 2: start from ether_header */ |
---|
| 3318 | + if (val < 0 || val > 2) |
---|
2979 | 3319 | err = -EINVAL; |
---|
2980 | 3320 | else |
---|
2981 | 3321 | tp->save_syn = val; |
---|
.. | .. |
---|
2983 | 3323 | |
---|
2984 | 3324 | case TCP_LINGER2: |
---|
2985 | 3325 | if (val < 0) |
---|
2986 | | - tp->linger2 = -1; |
---|
2987 | | - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) |
---|
2988 | | - tp->linger2 = 0; |
---|
| 3326 | + WRITE_ONCE(tp->linger2, -1); |
---|
| 3327 | + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) |
---|
| 3328 | + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); |
---|
2989 | 3329 | else |
---|
2990 | | - tp->linger2 = val * HZ; |
---|
| 3330 | + WRITE_ONCE(tp->linger2, val * HZ); |
---|
2991 | 3331 | break; |
---|
2992 | 3332 | |
---|
2993 | 3333 | case TCP_DEFER_ACCEPT: |
---|
2994 | 3334 | /* Translate value in seconds to number of retransmits */ |
---|
2995 | | - icsk->icsk_accept_queue.rskq_defer_accept = |
---|
2996 | | - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, |
---|
2997 | | - TCP_RTO_MAX / HZ); |
---|
| 3335 | + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, |
---|
| 3336 | + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, |
---|
| 3337 | + TCP_RTO_MAX / HZ)); |
---|
2998 | 3338 | break; |
---|
2999 | 3339 | |
---|
3000 | 3340 | case TCP_WINDOW_CLAMP: |
---|
.. | .. |
---|
3010 | 3350 | break; |
---|
3011 | 3351 | |
---|
3012 | 3352 | case TCP_QUICKACK: |
---|
3013 | | - if (!val) { |
---|
3014 | | - icsk->icsk_ack.pingpong = 1; |
---|
3015 | | - } else { |
---|
3016 | | - icsk->icsk_ack.pingpong = 0; |
---|
3017 | | - if ((1 << sk->sk_state) & |
---|
3018 | | - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
---|
3019 | | - inet_csk_ack_scheduled(sk)) { |
---|
3020 | | - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
---|
3021 | | - tcp_cleanup_rbuf(sk, 1); |
---|
3022 | | - if (!(val & 1)) |
---|
3023 | | - icsk->icsk_ack.pingpong = 1; |
---|
3024 | | - } |
---|
3025 | | - } |
---|
| 3353 | + __tcp_sock_set_quickack(sk, val); |
---|
3026 | 3354 | break; |
---|
3027 | 3355 | |
---|
3028 | 3356 | #ifdef CONFIG_TCP_MD5SIG |
---|
.. | .. |
---|
3038 | 3366 | if (val < 0) |
---|
3039 | 3367 | err = -EINVAL; |
---|
3040 | 3368 | else |
---|
3041 | | - icsk->icsk_user_timeout = val; |
---|
| 3369 | + WRITE_ONCE(icsk->icsk_user_timeout, val); |
---|
3042 | 3370 | break; |
---|
3043 | 3371 | |
---|
3044 | 3372 | case TCP_FASTOPEN: |
---|
.. | .. |
---|
3054 | 3382 | case TCP_FASTOPEN_CONNECT: |
---|
3055 | 3383 | if (val > 1 || val < 0) { |
---|
3056 | 3384 | err = -EINVAL; |
---|
3057 | | - } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { |
---|
| 3385 | + } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & |
---|
| 3386 | + TFO_CLIENT_ENABLE) { |
---|
3058 | 3387 | if (sk->sk_state == TCP_CLOSE) |
---|
3059 | 3388 | tp->fastopen_connect = val; |
---|
3060 | 3389 | else |
---|
.. | .. |
---|
3081 | 3410 | err = tcp_repair_set_window(tp, optval, optlen); |
---|
3082 | 3411 | break; |
---|
3083 | 3412 | case TCP_NOTSENT_LOWAT: |
---|
3084 | | - tp->notsent_lowat = val; |
---|
| 3413 | + WRITE_ONCE(tp->notsent_lowat, val); |
---|
3085 | 3414 | sk->sk_write_space(sk); |
---|
3086 | 3415 | break; |
---|
3087 | 3416 | case TCP_INQ: |
---|
.. | .. |
---|
3089 | 3418 | err = -EINVAL; |
---|
3090 | 3419 | else |
---|
3091 | 3420 | tp->recvmsg_inq = val; |
---|
| 3421 | + break; |
---|
| 3422 | + case TCP_TX_DELAY: |
---|
| 3423 | + if (val) |
---|
| 3424 | + tcp_enable_tx_delay(); |
---|
| 3425 | + WRITE_ONCE(tp->tcp_tx_delay, val); |
---|
3092 | 3426 | break; |
---|
3093 | 3427 | default: |
---|
3094 | 3428 | err = -ENOPROTOOPT; |
---|
.. | .. |
---|
3099 | 3433 | return err; |
---|
3100 | 3434 | } |
---|
3101 | 3435 | |
---|
3102 | | -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
---|
| 3436 | +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, |
---|
3103 | 3437 | unsigned int optlen) |
---|
3104 | 3438 | { |
---|
3105 | 3439 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
.. | .. |
---|
3110 | 3444 | return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
---|
3111 | 3445 | } |
---|
3112 | 3446 | EXPORT_SYMBOL(tcp_setsockopt); |
---|
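
The sockptr_t conversion lets one implementation serve both user and kernel buffers; copy_from_sockptr() and strncpy_from_sockptr() pick the right copy routine at run time. A minimal sketch of how a kernel-side caller would wrap its buffer (hypothetical function, shown only to illustrate the calling convention):

```c
#include <linux/sockptr.h>
#include <net/tcp.h>

/* Hypothetical: set TCP_NODELAY from kernel memory via the common path. */
static int example_set_nodelay_sockptr(struct sock *sk)
{
	int one = 1;

	return tcp_setsockopt(sk, SOL_TCP, TCP_NODELAY,
			      KERNEL_SOCKPTR(&one), sizeof(one));
}
```
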
3113 | | - |
---|
3114 | | -#ifdef CONFIG_COMPAT |
---|
3115 | | -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, |
---|
3116 | | - char __user *optval, unsigned int optlen) |
---|
3117 | | -{ |
---|
3118 | | - if (level != SOL_TCP) |
---|
3119 | | - return inet_csk_compat_setsockopt(sk, level, optname, |
---|
3120 | | - optval, optlen); |
---|
3121 | | - return do_tcp_setsockopt(sk, level, optname, optval, optlen); |
---|
3122 | | -} |
---|
3123 | | -EXPORT_SYMBOL(compat_tcp_setsockopt); |
---|
3124 | | -#endif |
---|
3125 | 3447 | |
---|
3126 | 3448 | static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, |
---|
3127 | 3449 | struct tcp_info *info) |
---|
.. | .. |
---|
3147 | 3469 | { |
---|
3148 | 3470 | const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ |
---|
3149 | 3471 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 3472 | + unsigned long rate; |
---|
3150 | 3473 | u32 now; |
---|
3151 | 3474 | u64 rate64; |
---|
3152 | 3475 | bool slow; |
---|
3153 | | - u32 rate; |
---|
3154 | 3476 | |
---|
3155 | 3477 | memset(info, 0, sizeof(*info)); |
---|
3156 | 3478 | if (sk->sk_type != SOCK_STREAM) |
---|
.. | .. |
---|
3160 | 3482 | |
---|
3161 | 3483 | /* Report meaningful fields for all TCP states, including listeners */ |
---|
3162 | 3484 | rate = READ_ONCE(sk->sk_pacing_rate); |
---|
3163 | | - rate64 = rate != ~0U ? rate : ~0ULL; |
---|
| 3485 | + rate64 = (rate != ~0UL) ? rate : ~0ULL; |
---|
3164 | 3486 | info->tcpi_pacing_rate = rate64; |
---|
3165 | 3487 | |
---|
3166 | 3488 | rate = READ_ONCE(sk->sk_max_pacing_rate); |
---|
3167 | | - rate64 = rate != ~0U ? rate : ~0ULL; |
---|
| 3489 | + rate64 = (rate != ~0UL) ? rate : ~0ULL; |
---|
3168 | 3490 | info->tcpi_max_pacing_rate = rate64; |
---|
3169 | 3491 | |
---|
3170 | 3492 | info->tcpi_reordering = tp->reordering; |
---|
.. | .. |
---|
3175 | 3497 | * tcpi_unacked -> Number of children ready for accept() |
---|
3176 | 3498 | * tcpi_sacked -> max backlog |
---|
3177 | 3499 | */ |
---|
3178 | | - info->tcpi_unacked = sk->sk_ack_backlog; |
---|
3179 | | - info->tcpi_sacked = sk->sk_max_ack_backlog; |
---|
| 3500 | + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); |
---|
| 3501 | + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); |
---|
3180 | 3502 | return; |
---|
3181 | 3503 | } |
---|
3182 | 3504 | |
---|
.. | .. |
---|
3254 | 3576 | info->tcpi_bytes_retrans = tp->bytes_retrans; |
---|
3255 | 3577 | info->tcpi_dsack_dups = tp->dsack_dups; |
---|
3256 | 3578 | info->tcpi_reord_seen = tp->reord_seen; |
---|
| 3579 | + info->tcpi_rcv_ooopack = tp->rcv_ooopack; |
---|
| 3580 | + info->tcpi_snd_wnd = tp->snd_wnd; |
---|
| 3581 | + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; |
---|
3257 | 3582 | unlock_sock_fast(sk, slow); |
---|
3258 | 3583 | } |
---|
3259 | 3584 | EXPORT_SYMBOL_GPL(tcp_get_info); |
---|
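
The fields added here (tcpi_rcv_ooopack, tcpi_snd_wnd, tcpi_fastopen_client_fail) surface through the existing TCP_INFO getsockopt; older kernels simply return a shorter struct, so careful callers check the returned length. A hedged userspace sketch, assuming libc headers new enough to expose the fields:

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Illustrative only: read the peer-advertised send window. */
static int read_snd_wnd(int fd, unsigned int *snd_wnd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) < 0)
		return -1;
	if (len < sizeof(ti))	/* kernel predates the new fields */
		return -1;
	*snd_wnd = ti.tcpi_snd_wnd;
	return 0;
}
```
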
.. | .. |
---|
3282 | 3607 | nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ |
---|
3283 | 3608 | nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ |
---|
3284 | 3609 | nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ |
---|
| 3610 | + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ |
---|
| 3611 | + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ |
---|
| 3612 | + nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ |
---|
| 3613 | + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ |
---|
3285 | 3614 | 0; |
---|
3286 | 3615 | } |
---|
3287 | 3616 | |
---|
3288 | | -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) |
---|
| 3617 | +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, |
---|
| 3618 | + const struct sk_buff *orig_skb) |
---|
3289 | 3619 | { |
---|
3290 | 3620 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
3291 | 3621 | struct sk_buff *stats; |
---|
3292 | 3622 | struct tcp_info info; |
---|
| 3623 | + unsigned long rate; |
---|
3293 | 3624 | u64 rate64; |
---|
3294 | | - u32 rate; |
---|
3295 | 3625 | |
---|
3296 | 3626 | stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); |
---|
3297 | 3627 | if (!stats) |
---|
.. | .. |
---|
3310 | 3640 | tp->total_retrans, TCP_NLA_PAD); |
---|
3311 | 3641 | |
---|
3312 | 3642 | rate = READ_ONCE(sk->sk_pacing_rate); |
---|
3313 | | - rate64 = rate != ~0U ? rate : ~0ULL; |
---|
| 3643 | + rate64 = (rate != ~0UL) ? rate : ~0ULL; |
---|
3314 | 3644 | nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); |
---|
3315 | 3645 | |
---|
3316 | 3646 | rate64 = tcp_compute_delivery_rate(tp); |
---|
.. | .. |
---|
3335 | 3665 | TCP_NLA_PAD); |
---|
3336 | 3666 | nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); |
---|
3337 | 3667 | nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); |
---|
| 3668 | + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); |
---|
| 3669 | + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); |
---|
| 3670 | + nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, |
---|
| 3671 | + max_t(int, 0, tp->write_seq - tp->snd_nxt)); |
---|
| 3672 | + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, |
---|
| 3673 | + TCP_NLA_PAD); |
---|
3338 | 3674 | |
---|
3339 | 3675 | return stats; |
---|
3340 | 3676 | } |
---|
.. | .. |
---|
3358 | 3694 | switch (optname) { |
---|
3359 | 3695 | case TCP_MAXSEG: |
---|
3360 | 3696 | val = tp->mss_cache; |
---|
3361 | | - if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) |
---|
| 3697 | + if (tp->rx_opt.user_mss && |
---|
| 3698 | + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) |
---|
3362 | 3699 | val = tp->rx_opt.user_mss; |
---|
3363 | 3700 | if (tp->repair) |
---|
3364 | 3701 | val = tp->rx_opt.mss_clamp; |
---|
.. | .. |
---|
3382 | 3719 | val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; |
---|
3383 | 3720 | break; |
---|
3384 | 3721 | case TCP_LINGER2: |
---|
3385 | | - val = tp->linger2; |
---|
| 3722 | + val = READ_ONCE(tp->linger2); |
---|
3386 | 3723 | if (val >= 0) |
---|
3387 | | - val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; |
---|
| 3724 | + val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; |
---|
3388 | 3725 | break; |
---|
3389 | 3726 | case TCP_DEFER_ACCEPT: |
---|
3390 | | - val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, |
---|
3391 | | - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); |
---|
| 3727 | + val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); |
---|
| 3728 | + val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, |
---|
| 3729 | + TCP_RTO_MAX / HZ); |
---|
3392 | 3730 | break; |
---|
3393 | 3731 | case TCP_WINDOW_CLAMP: |
---|
3394 | 3732 | val = tp->window_clamp; |
---|
.. | .. |
---|
3429 | 3767 | return 0; |
---|
3430 | 3768 | } |
---|
3431 | 3769 | case TCP_QUICKACK: |
---|
3432 | | - val = !icsk->icsk_ack.pingpong; |
---|
| 3770 | + val = !inet_csk_in_pingpong_mode(sk); |
---|
3433 | 3771 | break; |
---|
3434 | 3772 | |
---|
3435 | 3773 | case TCP_CONGESTION: |
---|
.. | .. |
---|
3458 | 3796 | return 0; |
---|
3459 | 3797 | |
---|
3460 | 3798 | case TCP_FASTOPEN_KEY: { |
---|
3461 | | - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; |
---|
3462 | | - struct tcp_fastopen_context *ctx; |
---|
| 3799 | + u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; |
---|
| 3800 | + unsigned int key_len; |
---|
3463 | 3801 | |
---|
3464 | 3802 | if (get_user(len, optlen)) |
---|
3465 | 3803 | return -EFAULT; |
---|
3466 | 3804 | |
---|
3467 | | - rcu_read_lock(); |
---|
3468 | | - ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); |
---|
3469 | | - if (ctx) |
---|
3470 | | - memcpy(key, ctx->key, sizeof(key)); |
---|
3471 | | - else |
---|
3472 | | - len = 0; |
---|
3473 | | - rcu_read_unlock(); |
---|
3474 | | - |
---|
3475 | | - len = min_t(unsigned int, len, sizeof(key)); |
---|
| 3805 | + key_len = tcp_fastopen_get_cipher(net, icsk, key) * |
---|
| 3806 | + TCP_FASTOPEN_KEY_LENGTH; |
---|
| 3807 | + len = min_t(unsigned int, len, key_len); |
---|
3476 | 3808 | if (put_user(len, optlen)) |
---|
3477 | 3809 | return -EFAULT; |
---|
3478 | 3810 | if (copy_to_user(optval, key, len)) |
---|
.. | .. |
---|
3530 | 3862 | break; |
---|
3531 | 3863 | |
---|
3532 | 3864 | case TCP_USER_TIMEOUT: |
---|
3533 | | - val = icsk->icsk_user_timeout; |
---|
| 3865 | + val = READ_ONCE(icsk->icsk_user_timeout); |
---|
3534 | 3866 | break; |
---|
3535 | 3867 | |
---|
3536 | 3868 | case TCP_FASTOPEN: |
---|
3537 | | - val = icsk->icsk_accept_queue.fastopenq.max_qlen; |
---|
| 3869 | + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); |
---|
3538 | 3870 | break; |
---|
3539 | 3871 | |
---|
3540 | 3872 | case TCP_FASTOPEN_CONNECT: |
---|
.. | .. |
---|
3545 | 3877 | val = tp->fastopen_no_cookie; |
---|
3546 | 3878 | break; |
---|
3547 | 3879 | |
---|
| 3880 | + case TCP_TX_DELAY: |
---|
| 3881 | + val = READ_ONCE(tp->tcp_tx_delay); |
---|
| 3882 | + break; |
---|
| 3883 | + |
---|
3548 | 3884 | case TCP_TIMESTAMP: |
---|
3549 | 3885 | val = tcp_time_stamp_raw() + tp->tsoffset; |
---|
3550 | 3886 | break; |
---|
3551 | 3887 | case TCP_NOTSENT_LOWAT: |
---|
3552 | | - val = tp->notsent_lowat; |
---|
| 3888 | + val = READ_ONCE(tp->notsent_lowat); |
---|
3553 | 3889 | break; |
---|
3554 | 3890 | case TCP_INQ: |
---|
3555 | 3891 | val = tp->recvmsg_inq; |
---|
.. | .. |
---|
3563 | 3899 | |
---|
3564 | 3900 | lock_sock(sk); |
---|
3565 | 3901 | if (tp->saved_syn) { |
---|
3566 | | - if (len < tp->saved_syn[0]) { |
---|
3567 | | - if (put_user(tp->saved_syn[0], optlen)) { |
---|
| 3902 | + if (len < tcp_saved_syn_len(tp->saved_syn)) { |
---|
| 3903 | + if (put_user(tcp_saved_syn_len(tp->saved_syn), |
---|
| 3904 | + optlen)) { |
---|
3568 | 3905 | release_sock(sk); |
---|
3569 | 3906 | return -EFAULT; |
---|
3570 | 3907 | } |
---|
3571 | 3908 | release_sock(sk); |
---|
3572 | 3909 | return -EINVAL; |
---|
3573 | 3910 | } |
---|
3574 | | - len = tp->saved_syn[0]; |
---|
| 3911 | + len = tcp_saved_syn_len(tp->saved_syn); |
---|
3575 | 3912 | if (put_user(len, optlen)) { |
---|
3576 | 3913 | release_sock(sk); |
---|
3577 | 3914 | return -EFAULT; |
---|
3578 | 3915 | } |
---|
3579 | | - if (copy_to_user(optval, tp->saved_syn + 1, len)) { |
---|
| 3916 | + if (copy_to_user(optval, tp->saved_syn->data, len)) { |
---|
3580 | 3917 | release_sock(sk); |
---|
3581 | 3918 | return -EFAULT; |
---|
3582 | 3919 | } |
---|
.. | .. |
---|
3592 | 3929 | } |
---|
3593 | 3930 | #ifdef CONFIG_MMU |
---|
3594 | 3931 | case TCP_ZEROCOPY_RECEIVE: { |
---|
3595 | | - struct tcp_zerocopy_receive zc; |
---|
| 3932 | + struct tcp_zerocopy_receive zc = {}; |
---|
3596 | 3933 | int err; |
---|
3597 | 3934 | |
---|
3598 | 3935 | if (get_user(len, optlen)) |
---|
3599 | 3936 | return -EFAULT; |
---|
3600 | | - if (len != sizeof(zc)) |
---|
| 3937 | + if (len < 0 || |
---|
| 3938 | + len < offsetofend(struct tcp_zerocopy_receive, length)) |
---|
3601 | 3939 | return -EINVAL; |
---|
| 3940 | + if (len > sizeof(zc)) { |
---|
| 3941 | + len = sizeof(zc); |
---|
| 3942 | + if (put_user(len, optlen)) |
---|
| 3943 | + return -EFAULT; |
---|
| 3944 | + } |
---|
3602 | 3945 | if (copy_from_user(&zc, optval, len)) |
---|
3603 | 3946 | return -EFAULT; |
---|
3604 | 3947 | lock_sock(sk); |
---|
3605 | 3948 | err = tcp_zerocopy_receive(sk, &zc); |
---|
3606 | 3949 | release_sock(sk); |
---|
| 3950 | + if (len >= offsetofend(struct tcp_zerocopy_receive, err)) |
---|
| 3951 | + goto zerocopy_rcv_sk_err; |
---|
| 3952 | + switch (len) { |
---|
| 3953 | + case offsetofend(struct tcp_zerocopy_receive, err): |
---|
| 3954 | + goto zerocopy_rcv_sk_err; |
---|
| 3955 | + case offsetofend(struct tcp_zerocopy_receive, inq): |
---|
| 3956 | + goto zerocopy_rcv_inq; |
---|
| 3957 | + case offsetofend(struct tcp_zerocopy_receive, length): |
---|
| 3958 | + default: |
---|
| 3959 | + goto zerocopy_rcv_out; |
---|
| 3960 | + } |
---|
| 3961 | +zerocopy_rcv_sk_err: |
---|
| 3962 | + if (!err) |
---|
| 3963 | + zc.err = sock_error(sk); |
---|
| 3964 | +zerocopy_rcv_inq: |
---|
| 3965 | + zc.inq = tcp_inq_hint(sk); |
---|
| 3966 | +zerocopy_rcv_out: |
---|
3607 | 3967 | if (!err && copy_to_user(optval, &zc, len)) |
---|
3608 | 3968 | err = -EFAULT; |
---|
3609 | 3969 | return err; |
---|
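
The length-based dispatch above implements an extensible-struct contract: userspace advertises, via optlen, how much of struct tcp_zerocopy_receive it understands, and the kernel fills exactly that prefix (length, then inq, then err). A hedged sketch of the caller side; the mmap()ed receive region and its setup are omitted:

```c
#include <linux/tcp.h>		/* struct tcp_zerocopy_receive */
#include <netinet/in.h>
#include <sys/socket.h>

/* Illustrative only: zc->address must already point at a region mapped
 * over this socket; passing sizeof(*zc) asks for every field we know.
 */
static int zc_receive(int fd, struct tcp_zerocopy_receive *zc)
{
	socklen_t len = sizeof(*zc);

	return getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, zc, &len);
}
```
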
.. | .. |
---|
3631 | 3991 | return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
---|
3632 | 3992 | } |
---|
3633 | 3993 | EXPORT_SYMBOL(tcp_getsockopt); |
---|
3634 | | - |
---|
3635 | | -#ifdef CONFIG_COMPAT |
---|
3636 | | -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, |
---|
3637 | | - char __user *optval, int __user *optlen) |
---|
3638 | | -{ |
---|
3639 | | - if (level != SOL_TCP) |
---|
3640 | | - return inet_csk_compat_getsockopt(sk, level, optname, |
---|
3641 | | - optval, optlen); |
---|
3642 | | - return do_tcp_getsockopt(sk, level, optname, optval, optlen); |
---|
3643 | | -} |
---|
3644 | | -EXPORT_SYMBOL(compat_tcp_getsockopt); |
---|
3645 | | -#endif |
---|
3646 | 3994 | |
---|
3647 | 3995 | #ifdef CONFIG_TCP_MD5SIG |
---|
3648 | 3996 | static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); |
---|
.. | .. |
---|
3686 | 4034 | * to memory. See smp_rmb() in tcp_get_md5sig_pool() |
---|
3687 | 4035 | */ |
---|
3688 | 4036 | smp_wmb(); |
---|
3689 | | - tcp_md5sig_pool_populated = true; |
---|
| 4037 | + /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() |
---|
| 4038 | + * and tcp_get_md5sig_pool(). |
---|
| 4039 | + */ |
---|
| 4040 | + WRITE_ONCE(tcp_md5sig_pool_populated, true); |
---|
3690 | 4041 | } |
---|
3691 | 4042 | |
---|
3692 | 4043 | bool tcp_alloc_md5sig_pool(void) |
---|
3693 | 4044 | { |
---|
3694 | | - if (unlikely(!tcp_md5sig_pool_populated)) { |
---|
| 4045 | + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ |
---|
| 4046 | + if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { |
---|
3695 | 4047 | mutex_lock(&tcp_md5sig_mutex); |
---|
3696 | 4048 | |
---|
3697 | | - if (!tcp_md5sig_pool_populated) |
---|
| 4049 | + if (!tcp_md5sig_pool_populated) { |
---|
3698 | 4050 | __tcp_alloc_md5sig_pool(); |
---|
| 4051 | + if (tcp_md5sig_pool_populated) |
---|
| 4052 | + static_branch_inc(&tcp_md5_needed); |
---|
| 4053 | + } |
---|
3699 | 4054 | |
---|
3700 | 4055 | mutex_unlock(&tcp_md5sig_mutex); |
---|
3701 | 4056 | } |
---|
3702 | | - return tcp_md5sig_pool_populated; |
---|
| 4057 | + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ |
---|
| 4058 | + return READ_ONCE(tcp_md5sig_pool_populated); |
---|
3703 | 4059 | } |
---|
3704 | 4060 | EXPORT_SYMBOL(tcp_alloc_md5sig_pool); |
---|
3705 | 4061 | |
---|
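
The annotations added above formalise a one-time-publish pattern: the writer makes the pool contents visible with smp_wmb() before flipping the flag with WRITE_ONCE(), and readers pair READ_ONCE() with smp_rmb() before touching the pool. A generic sketch of the same idiom with a hypothetical object:

```c
#include <linux/compiler.h>	/* READ_ONCE / WRITE_ONCE */
#include <linux/types.h>	/* bool */
#include <asm/barrier.h>	/* smp_wmb / smp_rmb */

struct thing;			/* hypothetical payload */

static bool thing_ready;
static struct thing *global_thing;

/* Writer: publish the object, then the flag. */
static void publish_thing(struct thing *t)
{
	global_thing = t;
	smp_wmb();		/* order the pointer store before the flag */
	WRITE_ONCE(thing_ready, true);
}

/* Reader: flag first, then the object it guards. */
static struct thing *get_thing(void)
{
	if (READ_ONCE(thing_ready)) {
		smp_rmb();	/* paired with smp_wmb() above */
		return global_thing;
	}
	return NULL;
}
```
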
.. | .. |
---|
3715 | 4071 | { |
---|
3716 | 4072 | local_bh_disable(); |
---|
3717 | 4073 | |
---|
3718 | | - if (tcp_md5sig_pool_populated) { |
---|
| 4074 | + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ |
---|
| 4075 | + if (READ_ONCE(tcp_md5sig_pool_populated)) { |
---|
3719 | 4076 | /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ |
---|
3720 | 4077 | smp_rmb(); |
---|
3721 | 4078 | return this_cpu_ptr(&tcp_md5sig_pool); |
---|
.. | .. |
---|
3745 | 4102 | return 1; |
---|
3746 | 4103 | |
---|
3747 | 4104 | for (i = 0; i < shi->nr_frags; ++i) { |
---|
3748 | | - const struct skb_frag_struct *f = &shi->frags[i]; |
---|
3749 | | - unsigned int offset = f->page_offset; |
---|
| 4105 | + const skb_frag_t *f = &shi->frags[i]; |
---|
| 4106 | + unsigned int offset = skb_frag_off(f); |
---|
3750 | 4107 | struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); |
---|
3751 | 4108 | |
---|
3752 | 4109 | sg_set_page(&sg, page, skb_frag_size(f), |
---|
.. | .. |
---|
3772 | 4129 | sg_init_one(&sg, key->key, keylen); |
---|
3773 | 4130 | ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); |
---|
3774 | 4131 | |
---|
3775 | | - /* tcp_md5_do_add() might change key->key under us */ |
---|
3776 | | - return crypto_ahash_update(hp->md5_req); |
---|
| 4132 | + /* We use data_race() because tcp_md5_do_add() might change key->key under us */ |
---|
| 4133 | + return data_race(crypto_ahash_update(hp->md5_req)); |
---|
3777 | 4134 | } |
---|
3778 | 4135 | EXPORT_SYMBOL(tcp_md5_hash_key); |
---|
3779 | 4136 | |
---|
.. | .. |
---|
3781 | 4138 | |
---|
3782 | 4139 | void tcp_done(struct sock *sk) |
---|
3783 | 4140 | { |
---|
3784 | | - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; |
---|
| 4141 | + struct request_sock *req; |
---|
| 4142 | + |
---|
| 4143 | + /* We might be called with a new socket, after |
---|
| 4144 | + * inet_csk_prepare_forced_close() has been called |
---|
| 4145 | + * so we cannot use lockdep_sock_is_held(sk). |
---|
| 4146 | + */ |
---|
| 4147 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); |
---|
3785 | 4148 | |
---|
3786 | 4149 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) |
---|
3787 | 4150 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
---|
.. | .. |
---|
3791 | 4154 | if (req) |
---|
3792 | 4155 | reqsk_fastopen_remove(sk, req, false); |
---|
3793 | 4156 | |
---|
3794 | | - sk->sk_shutdown = SHUTDOWN_MASK; |
---|
| 4157 | + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); |
---|
3795 | 4158 | |
---|
3796 | 4159 | if (!sock_flag(sk, SOCK_DEAD)) |
---|
3797 | 4160 | sk->sk_state_change(sk); |
---|
.. | .. |
---|
3880 | 4243 | |
---|
3881 | 4244 | BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); |
---|
3882 | 4245 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > |
---|
3883 | | - FIELD_SIZEOF(struct sk_buff, cb)); |
---|
| 4246 | + sizeof_field(struct sk_buff, cb)); |
---|
3884 | 4247 | |
---|
3885 | 4248 | percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); |
---|
3886 | 4249 | percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); |
---|
.. | .. |
---|
3954 | 4317 | tcp_metrics_init(); |
---|
3955 | 4318 | BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); |
---|
3956 | 4319 | tcp_tasklet_init(); |
---|
| 4320 | + mptcp_init(); |
---|
3957 | 4321 | } |
---|