2024-05-14 bedbef8ad3e75a304af6361af235302bcc61d06b
kernel/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 * INET An implementation of the TCP/IP protocol suite for the LINUX
 * operating system. INET is implemented using the BSD Socket
@@ -205,11 +206,6 @@
 * Hirokazu Takahashi : Use copy_from_user() instead of
 * csum_and_copy_from_user() if possible.
 *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
- *
 * Description of States:
 *
 * TCP_SYN_SENT sent a connection request, waiting for ack
@@ -262,7 +258,7 @@
 #include <linux/net.h>
 #include <linux/socket.h>
 #include <linux/random.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/cache.h>
@@ -275,6 +271,7 @@
 #include <net/icmp.h>
 #include <net/inet_common.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
@@ -282,6 +279,8 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+
+#include <trace/hooks/ipv4.h>
 
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -320,6 +319,11 @@
 */
 unsigned long tcp_memory_pressure __read_mostly;
 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
+
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 
 void tcp_enter_memory_pressure(struct sock *sk)
 {
@@ -416,6 +420,8 @@
 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
 icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
 
@@ -428,6 +434,7 @@
 
 /* There's a bubble in the pipe until at least the first ACK. */
 tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
 
 /* See draft-stevens-tcpca-spec-01 for discussion of the
 * initialization of these values.
@@ -436,38 +443,24 @@
 tp->snd_cwnd_clamp = ~0;
 tp->mss_cache = TCP_MSS_DEFAULT;
 
- tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
 tcp_assign_congestion_control(sk);
 
 tp->tsoffset = 0;
 tp->rack.reo_wnd_steps = 1;
-
- sk->sk_state = TCP_CLOSE;
 
 sk->sk_write_space = sk_stream_write_space;
 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
 icsk->icsk_sync_mss = tcp_sync_mss;
 
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
 
 sk_sockets_allocated_inc(sk);
 sk->sk_route_forced_caps = NETIF_F_GSO;
 }
 EXPORT_SYMBOL(tcp_init_sock);
-
-void tcp_init_transfer(struct sock *sk, int bpf_op)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- tcp_mtup_init(sk);
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_init_metrics(sk);
- tcp_call_bpf(sk, bpf_op, 0, NULL);
- tcp_init_congestion_control(sk);
- tcp_init_buffer_space(sk);
-}
 
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
@@ -515,6 +508,7 @@
 __poll_t mask;
 struct sock *sk = sock->sk;
 const struct tcp_sock *tp = tcp_sk(sk);
+ u8 shutdown;
 int state;
 
 sock_poll_wait(file, sock, wait);
@@ -557,17 +551,18 @@
 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 * blocking on fresh not-connected or disconnected socket. --ANK
 */
- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
 mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (shutdown & RCV_SHUTDOWN)
 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
 /* Connected or passive Fast Open socket? */
 if (state != TCP_SYN_SENT &&
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
 int target = sock_rcvlowat(sk, 0, INT_MAX);
 
- if (tp->urg_seq == READ_ONCE(tp->copied_seq) &&
+ if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
 !sock_flag(sk, SOCK_URGINLINE) &&
 tp->urg_data)
 target++;
@@ -575,8 +570,8 @@
 if (tcp_stream_is_readable(tp, target, sk))
 mask |= EPOLLIN | EPOLLRDNORM;
 
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (!(shutdown & SEND_SHUTDOWN)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
 mask |= EPOLLOUT | EPOLLWRNORM;
 } else { /* send SIGIO later */
 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -588,7 +583,7 @@
 * pairs with the input side.
 */
 smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
 mask |= EPOLLOUT | EPOLLWRNORM;
 }
 } else
@@ -628,7 +623,8 @@
 unlock_sock_fast(sk, slow);
 break;
 case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq);
+ answ = tp->urg_data &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
 break;
 case SIOCOUTQ:
 if (sk->sk_state == TCP_LISTEN)
@@ -646,7 +642,8 @@
 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 answ = 0;
 else
- answ = READ_ONCE(tp->write_seq) - tp->snd_nxt;
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
 break;
 default:
 return -ENOIOCTLCMD;
@@ -678,7 +675,7 @@
 tcb->sacked = 0;
 __skb_header_release(skb);
 tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
 sk_mem_charge(sk, skb->truesize);
 if (tp->nonagle & TCP_NAGLE_PUSH)
 tp->nonagle &= ~TCP_NAGLE_PUSH;
@@ -706,13 +703,13 @@
 int size_goal)
 {
 return skb->len < size_goal &&
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
 !tcp_rtx_queue_empty(sk) &&
 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
 }
 
-static void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle, int size_goal)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
 {
 struct tcp_sock *tp = tcp_sk(sk);
 struct sk_buff *skb;
@@ -875,6 +872,18 @@
 {
 struct sk_buff *skb;
 
+ if (likely(!size)) {
+ skb = sk->sk_tx_skb_cache;
+ if (skb) {
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ sk->sk_tx_skb_cache = NULL;
+ pskb_trim(skb, 0);
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+ skb_shinfo(skb)->tx_flags = 0;
+ memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
+ return skb;
+ }
+ }
 /* The TCP header must be at least 32-bit aligned. */
 size = ALIGN(size, 4);
 
@@ -934,7 +943,7 @@
 return max(size_goal, mss_now);
 }
 
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 {
 int mss_now;
 
@@ -969,6 +978,11 @@
 ssize_t copied;
 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ WARN_ONCE(!sendpage_ok(page),
+ "page must not be a Slab one and have page_count > 0"))
+ return -EINVAL;
+
 /* Wait for a connection to finish. One exception is TCP Fast Open
 * (passive side) where data is allowed to be sent before a connection
 * is fully established.
@@ -998,13 +1012,16 @@
 !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
 
 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 tcp_rtx_and_write_queues_empty(sk));
 if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
 
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
 skb_entail(sk, skb);
 copy = size_goal;
 }
@@ -1019,7 +1036,7 @@
 goto new_segment;
 }
 if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
 
 if (can_coalesce) {
 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1034,7 +1051,7 @@
 skb->len += copy;
 skb->data_len += copy;
 skb->truesize += copy;
- sk->sk_wmem_queued += copy;
+ sk_wmem_queued_add(sk, copy);
 sk_mem_charge(sk, copy);
 skb->ip_summed = CHECKSUM_PARTIAL;
 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
@@ -1060,9 +1077,8 @@
 tcp_push_one(sk, mss_now);
 continue;
 
-wait_for_sndbuf:
+wait_for_space:
 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 tcp_push(sk, flags & ~MSG_MORE, mss_now,
 TCP_NAGLE_PUSH, size_goal);
 
@@ -1120,30 +1136,6 @@
 }
 EXPORT_SYMBOL(tcp_sendpage);
 
-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
- if (first_skb)
- return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
- return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
- if (zc)
- return 0;
- return linear_payload_sz(first_skb);
-}
-
 void tcp_free_fastopen_req(struct tcp_sock *tp)
 {
 if (tp->fastopen_req) {
@@ -1153,14 +1145,16 @@
 }
 
 static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
- int *copied, size_t size)
+ int *copied, size_t size,
+ struct ubuf_info *uarg)
 {
 struct tcp_sock *tp = tcp_sk(sk);
 struct inet_sock *inet = inet_sk(sk);
 struct sockaddr *uaddr = msg->msg_name;
 int err, flags;
 
- if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) ||
 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
 uaddr->sa_family == AF_UNSPEC))
 return -EOPNOTSUPP;
@@ -1173,6 +1167,7 @@
 return -ENOBUFS;
 tp->fastopen_req->data = msg;
 tp->fastopen_req->size = size;
+ tp->fastopen_req->uarg = uarg;
 
 if (inet->defer_connect) {
 err = tcp_connect(sk);
....@@ -1205,18 +1200,14 @@
12051200 struct sockcm_cookie sockc;
12061201 int flags, err, copied = 0;
12071202 int mss_now = 0, size_goal, copied_syn = 0;
1208
- bool process_backlog = false;
1203
+ int process_backlog = 0;
12091204 bool zc = false;
12101205 long timeo;
12111206
1207
+ trace_android_rvh_tcp_sendmsg_locked(sk, size);
12121208 flags = msg->msg_flags;
12131209
12141210 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1215
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
1216
- err = -EINVAL;
1217
- goto out_err;
1218
- }
1219
-
12201211 skb = tcp_write_queue_tail(sk);
12211212 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
12221213 if (!uarg) {
....@@ -1231,7 +1222,7 @@
12311222
12321223 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
12331224 !tp->repair) {
1234
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1225
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
12351226 if (err == -EINPROGRESS && copied_syn > 0)
12361227 goto out;
12371228 else if (err)
....@@ -1297,31 +1288,30 @@
12971288
12981289 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
12991290 bool first_skb;
1300
- int linear;
13011291
13021292 new_segment:
13031293 if (!sk_stream_memory_free(sk))
1304
- goto wait_for_sndbuf;
1294
+ goto wait_for_space;
13051295
1306
- if (process_backlog && sk_flush_backlog(sk)) {
1307
- process_backlog = false;
1308
- goto restart;
1296
+ if (unlikely(process_backlog >= 16)) {
1297
+ process_backlog = 0;
1298
+ if (sk_flush_backlog(sk))
1299
+ goto restart;
13091300 }
13101301 first_skb = tcp_rtx_and_write_queues_empty(sk);
1311
- linear = select_size(first_skb, zc);
1312
- skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
1302
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
13131303 first_skb);
13141304 if (!skb)
1315
- goto wait_for_memory;
1305
+ goto wait_for_space;
13161306
1317
- process_backlog = true;
1307
+ process_backlog++;
13181308 skb->ip_summed = CHECKSUM_PARTIAL;
13191309
13201310 skb_entail(sk, skb);
13211311 copy = size_goal;
13221312
13231313 /* All packets are restored as if they have
1324
- * already been sent. skb_mstamp isn't set to
1314
+ * already been sent. skb_mstamp_ns isn't set to
13251315 * avoid wrong rtt estimation.
13261316 */
13271317 if (tp->repair)
....@@ -1345,7 +1335,7 @@
13451335 struct page_frag *pfrag = sk_page_frag(sk);
13461336
13471337 if (!sk_page_frag_refill(sk, pfrag))
1348
- goto wait_for_memory;
1338
+ goto wait_for_space;
13491339
13501340 if (!skb_can_coalesce(skb, i, pfrag->page,
13511341 pfrag->offset)) {
....@@ -1359,7 +1349,7 @@
13591349 copy = min_t(int, copy, pfrag->size - pfrag->offset);
13601350
13611351 if (!sk_wmem_schedule(sk, copy))
1362
- goto wait_for_memory;
1352
+ goto wait_for_space;
13631353
13641354 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
13651355 pfrag->page,
....@@ -1378,6 +1368,9 @@
13781368 }
13791369 pfrag->offset += copy;
13801370 } else {
1371
+ if (!sk_wmem_schedule(sk, copy))
1372
+ goto wait_for_space;
1373
+
13811374 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
13821375 if (err == -EMSGSIZE || err == -EEXIST) {
13831376 tcp_mark_push(tp, skb);
....@@ -1412,9 +1405,8 @@
14121405 tcp_push_one(sk, mss_now);
14131406 continue;
14141407
1415
-wait_for_sndbuf:
1408
+wait_for_space:
14161409 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1417
-wait_for_memory:
14181410 if (copied)
14191411 tcp_push(sk, flags & ~MSG_MORE, mss_now,
14201412 TCP_NAGLE_PUSH, size_goal);
....@@ -1443,7 +1435,7 @@
14431435 if (copied + copied_syn)
14441436 goto out;
14451437 out_err:
1446
- sock_zerocopy_put_abort(uarg);
1438
+ sock_zerocopy_put_abort(uarg, true);
14471439 err = sk_stream_error(sk, flags, err);
14481440 /* make sure we wake any epoll edge trigger waiter */
14491441 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
@@ -1546,7 +1538,7 @@
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 struct tcp_sock *tp = tcp_sk(sk);
 bool time_to_ack = false;
@@ -1559,10 +1551,8 @@
 
 if (inet_csk_ack_scheduled(sk)) {
 const struct inet_connection_sock *icsk = inet_csk(sk);
- /* Delayed ACKs frequently hit locked sockets during bulk
- * receive. */
- if (icsk->icsk_ack.blocked ||
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
 /*
 * If this read emptied read buffer, we send ACK, if
@@ -1573,7 +1563,7 @@
 (copied > 0 &&
 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
- !icsk->icsk_ack.pingpong)) &&
+ !inet_csk_in_pingpong_mode(sk))) &&
 !atomic_read(&sk->sk_rmem_alloc)))
 time_to_ack = true;
 }
@@ -1669,11 +1659,13 @@
 if (!copied)
 copied = used;
 break;
- } else if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
 }
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
 /* If recv_actor drops the lock (e.g. TCP splice
 * receive) the skb pointer might be invalid when
 * getting here: tcp_collapse might have deleted it
@@ -1725,9 +1717,9 @@
 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 cap = sk->sk_rcvbuf >> 1;
 else
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
 val = min(val, cap);
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 
 /* Check if we need to signal EPOLLIN right now */
 tcp_data_ready(sk);
@@ -1737,7 +1729,7 @@
 
 val <<= 1;
 if (val > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = val;
+ WRITE_ONCE(sk->sk_rcvbuf, val);
 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
 }
 return 0;
@@ -1755,7 +1747,7 @@
 return -EPERM;
 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
 
- /* Instruct vm_insert_page() to not down_read(mmap_sem) */
+ /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
 vma->vm_flags |= VM_MIXEDMAP;
 
 vma->vm_ops = &tcp_vm_ops;
....@@ -1763,16 +1755,126 @@
17631755 }
17641756 EXPORT_SYMBOL(tcp_mmap);
17651757
1758
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1759
+ u32 *offset_frag)
1760
+{
1761
+ skb_frag_t *frag;
1762
+
1763
+ if (unlikely(offset_skb >= skb->len))
1764
+ return NULL;
1765
+
1766
+ offset_skb -= skb_headlen(skb);
1767
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1768
+ return NULL;
1769
+
1770
+ frag = skb_shinfo(skb)->frags;
1771
+ while (offset_skb) {
1772
+ if (skb_frag_size(frag) > offset_skb) {
1773
+ *offset_frag = offset_skb;
1774
+ return frag;
1775
+ }
1776
+ offset_skb -= skb_frag_size(frag);
1777
+ ++frag;
1778
+ }
1779
+ *offset_frag = 0;
1780
+ return frag;
1781
+}
1782
+
1783
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1784
+ struct sk_buff *skb, u32 copylen,
1785
+ u32 *offset, u32 *seq)
1786
+{
1787
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
1788
+ struct msghdr msg = {};
1789
+ struct iovec iov;
1790
+ int err;
1791
+
1792
+ if (copy_address != zc->copybuf_address)
1793
+ return -EINVAL;
1794
+
1795
+ err = import_single_range(READ, (void __user *)copy_address,
1796
+ copylen, &iov, &msg.msg_iter);
1797
+ if (err)
1798
+ return err;
1799
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1800
+ if (err)
1801
+ return err;
1802
+ zc->recv_skip_hint -= copylen;
1803
+ *offset += copylen;
1804
+ *seq += copylen;
1805
+ return (__s32)copylen;
1806
+}
1807
+
1808
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
1809
+ struct sock *sk,
1810
+ struct sk_buff *skb,
1811
+ u32 *seq,
1812
+ s32 copybuf_len)
1813
+{
1814
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1815
+
1816
+ if (!copylen)
1817
+ return 0;
1818
+ /* skb is null if inq < PAGE_SIZE. */
1819
+ if (skb)
1820
+ offset = *seq - TCP_SKB_CB(skb)->seq;
1821
+ else
1822
+ skb = tcp_recv_skb(sk, *seq, &offset);
1823
+
1824
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1825
+ seq);
1826
+ return zc->copybuf_len < 0 ? 0 : copylen;
1827
+}
1828
+
1829
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1830
+ struct page **pages,
1831
+ unsigned long pages_to_map,
1832
+ unsigned long *insert_addr,
1833
+ u32 *length_with_pending,
1834
+ u32 *seq,
1835
+ struct tcp_zerocopy_receive *zc)
1836
+{
1837
+ unsigned long pages_remaining = pages_to_map;
1838
+ int bytes_mapped;
1839
+ int ret;
1840
+
1841
+ ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
1842
+ bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
1843
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
1844
+ * mapping (some but not all of the pages).
1845
+ */
1846
+ *seq += bytes_mapped;
1847
+ *insert_addr += bytes_mapped;
1848
+ if (ret) {
1849
+ /* But if vm_insert_pages did fail, we have to unroll some state
1850
+ * we speculatively touched before.
1851
+ */
1852
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1853
+ *length_with_pending -= bytes_not_mapped;
1854
+ zc->recv_skip_hint += bytes_not_mapped;
1855
+ }
1856
+ return ret;
1857
+}
1858
+
17661859 static int tcp_zerocopy_receive(struct sock *sk,
17671860 struct tcp_zerocopy_receive *zc)
17681861 {
1862
+ u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
17691863 unsigned long address = (unsigned long)zc->address;
1864
+ s32 copybuf_len = zc->copybuf_len;
1865
+ struct tcp_sock *tp = tcp_sk(sk);
1866
+ #define PAGE_BATCH_SIZE 8
1867
+ struct page *pages[PAGE_BATCH_SIZE];
17701868 const skb_frag_t *frags = NULL;
1771
- u32 length = 0, seq, offset;
17721869 struct vm_area_struct *vma;
17731870 struct sk_buff *skb = NULL;
1774
- struct tcp_sock *tp;
1871
+ unsigned long pg_idx = 0;
1872
+ unsigned long curr_addr;
1873
+ u32 seq = tp->copied_seq;
1874
+ int inq = tcp_inq(sk);
17751875 int ret;
1876
+
1877
+ zc->copybuf_len = 0;
17761878
17771879 if (address & (PAGE_SIZE - 1) || address != zc->address)
17781880 return -EINVAL;
....@@ -1782,65 +1884,98 @@
17821884
17831885 sock_rps_record_flow(sk);
17841886
1785
- down_read(&current->mm->mmap_sem);
1887
+ mmap_read_lock(current->mm);
17861888
17871889 vma = find_vma(current->mm, address);
17881890 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1789
- up_read(&current->mm->mmap_sem);
1891
+ mmap_read_unlock(current->mm);
17901892 return -EINVAL;
17911893 }
1792
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1793
-
1794
- tp = tcp_sk(sk);
1795
- seq = tp->copied_seq;
1796
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
1797
- zc->length &= ~(PAGE_SIZE - 1);
1798
-
1799
- zap_page_range(vma, address, zc->length);
1800
-
1801
- zc->recv_skip_hint = 0;
1894
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
1895
+ avail_len = min_t(u32, vma_len, inq);
1896
+ aligned_len = avail_len & ~(PAGE_SIZE - 1);
1897
+ if (aligned_len) {
1898
+ zap_page_range(vma, address, aligned_len);
1899
+ zc->length = aligned_len;
1900
+ zc->recv_skip_hint = 0;
1901
+ } else {
1902
+ zc->length = avail_len;
1903
+ zc->recv_skip_hint = avail_len;
1904
+ }
18021905 ret = 0;
1906
+ curr_addr = address;
18031907 while (length + PAGE_SIZE <= zc->length) {
18041908 if (zc->recv_skip_hint < PAGE_SIZE) {
1909
+ u32 offset_frag;
1910
+
1911
+ /* If we're here, finish the current batch. */
1912
+ if (pg_idx) {
1913
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
1914
+ pg_idx,
1915
+ &curr_addr,
1916
+ &length,
1917
+ &seq, zc);
1918
+ if (ret)
1919
+ goto out;
1920
+ pg_idx = 0;
1921
+ }
18051922 if (skb) {
1923
+ if (zc->recv_skip_hint > 0)
1924
+ break;
18061925 skb = skb->next;
18071926 offset = seq - TCP_SKB_CB(skb)->seq;
18081927 } else {
18091928 skb = tcp_recv_skb(sk, seq, &offset);
18101929 }
1811
-
18121930 zc->recv_skip_hint = skb->len - offset;
1813
- offset -= skb_headlen(skb);
1814
- if ((int)offset < 0 || skb_has_frag_list(skb))
1931
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
1932
+ if (!frags || offset_frag)
18151933 break;
1816
- frags = skb_shinfo(skb)->frags;
1817
- while (offset) {
1818
- if (frags->size > offset)
1819
- goto out;
1820
- offset -= frags->size;
1934
+ }
1935
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1936
+ int remaining = zc->recv_skip_hint;
1937
+
1938
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1939
+ skb_frag_off(frags))) {
1940
+ remaining -= skb_frag_size(frags);
18211941 frags++;
18221942 }
1943
+ zc->recv_skip_hint -= remaining;
1944
+ break;
18231945 }
1824
- if (frags->size != PAGE_SIZE || frags->page_offset)
1825
- break;
1826
- ret = vm_insert_page(vma, address + length,
1827
- skb_frag_page(frags));
1828
- if (ret)
1829
- break;
1946
+ pages[pg_idx] = skb_frag_page(frags);
1947
+ pg_idx++;
18301948 length += PAGE_SIZE;
1831
- seq += PAGE_SIZE;
18321949 zc->recv_skip_hint -= PAGE_SIZE;
18331950 frags++;
1951
+ if (pg_idx == PAGE_BATCH_SIZE) {
1952
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1953
+ &curr_addr, &length,
1954
+ &seq, zc);
1955
+ if (ret)
1956
+ goto out;
1957
+ pg_idx = 0;
1958
+ }
1959
+ }
1960
+ if (pg_idx) {
1961
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1962
+ &curr_addr, &length, &seq,
1963
+ zc);
18341964 }
18351965 out:
1836
- up_read(&current->mm->mmap_sem);
1837
- if (length) {
1966
+ mmap_read_unlock(current->mm);
1967
+ /* Try to copy straggler data. */
1968
+ if (!ret)
1969
+ copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
1970
+ copybuf_len);
1971
+
1972
+ if (length + copylen) {
18381973 WRITE_ONCE(tp->copied_seq, seq);
18391974 tcp_rcv_space_adjust(sk);
18401975
18411976 /* Clean up data we have read: This will do ACK frames. */
18421977 tcp_recv_skb(sk, seq, &offset);
1843
- tcp_cleanup_rbuf(sk, length);
1978
+ tcp_cleanup_rbuf(sk, length + copylen);
18441979 ret = 0;
18451980 if (length == zc->length)
18461981 zc->recv_skip_hint = 0;
....@@ -1854,57 +1989,82 @@
18541989 #endif
18551990
18561991 static void tcp_update_recv_tstamps(struct sk_buff *skb,
1857
- struct scm_timestamping *tss)
1992
+ struct scm_timestamping_internal *tss)
18581993 {
18591994 if (skb->tstamp)
1860
- tss->ts[0] = ktime_to_timespec(skb->tstamp);
1995
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
18611996 else
1862
- tss->ts[0] = (struct timespec) {0};
1997
+ tss->ts[0] = (struct timespec64) {0};
18631998
18641999 if (skb_hwtstamps(skb)->hwtstamp)
1865
- tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
2000
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
18662001 else
1867
- tss->ts[2] = (struct timespec) {0};
2002
+ tss->ts[2] = (struct timespec64) {0};
18682003 }
18692004
18702005 /* Similar to __sock_recv_timestamp, but does not require an skb */
18712006 static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1872
- struct scm_timestamping *tss)
2007
+ struct scm_timestamping_internal *tss)
18732008 {
1874
- struct timeval tv;
2009
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
18752010 bool has_timestamping = false;
18762011
18772012 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
18782013 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
18792014 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
1880
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
1881
- sizeof(tss->ts[0]), &tss->ts[0]);
2015
+ if (new_tstamp) {
2016
+ struct __kernel_timespec kts = {
2017
+ .tv_sec = tss->ts[0].tv_sec,
2018
+ .tv_nsec = tss->ts[0].tv_nsec,
2019
+ };
2020
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2021
+ sizeof(kts), &kts);
2022
+ } else {
2023
+ struct __kernel_old_timespec ts_old = {
2024
+ .tv_sec = tss->ts[0].tv_sec,
2025
+ .tv_nsec = tss->ts[0].tv_nsec,
2026
+ };
2027
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2028
+ sizeof(ts_old), &ts_old);
2029
+ }
18822030 } else {
1883
- tv.tv_sec = tss->ts[0].tv_sec;
1884
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
1885
-
1886
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
1887
- sizeof(tv), &tv);
2031
+ if (new_tstamp) {
2032
+ struct __kernel_sock_timeval stv = {
2033
+ .tv_sec = tss->ts[0].tv_sec,
2034
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
2035
+ };
2036
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2037
+ sizeof(stv), &stv);
2038
+ } else {
2039
+ struct __kernel_old_timeval tv = {
2040
+ .tv_sec = tss->ts[0].tv_sec,
2041
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
2042
+ };
2043
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2044
+ sizeof(tv), &tv);
2045
+ }
18882046 }
18892047 }
18902048
18912049 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
18922050 has_timestamping = true;
18932051 else
1894
- tss->ts[0] = (struct timespec) {0};
2052
+ tss->ts[0] = (struct timespec64) {0};
18952053 }
18962054
18972055 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
18982056 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
18992057 has_timestamping = true;
19002058 else
1901
- tss->ts[2] = (struct timespec) {0};
2059
+ tss->ts[2] = (struct timespec64) {0};
19022060 }
19032061
19042062 if (has_timestamping) {
1905
- tss->ts[1] = (struct timespec) {0};
1906
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
1907
- sizeof(*tss), tss);
2063
+ tss->ts[1] = (struct timespec64) {0};
2064
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
2065
+ put_cmsg_scm_timestamping64(msg, tss);
2066
+ else
2067
+ put_cmsg_scm_timestamping(msg, tss);
19082068 }
19092069 }
19102070
....@@ -1950,12 +2110,12 @@
19502110 long timeo;
19512111 struct sk_buff *skb, *last;
19522112 u32 urg_hole = 0;
1953
- struct scm_timestamping tss;
1954
- bool has_tss = false;
1955
- bool has_cmsg;
2113
+ struct scm_timestamping_internal tss;
2114
+ int cmsg_flags;
19562115
19572116 if (unlikely(flags & MSG_ERRQUEUE))
19582117 return inet_recv_error(sk, msg, len, addr_len);
2118
+ trace_android_rvh_tcp_recvmsg(sk);
19592119
19602120 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
19612121 (sk->sk_state == TCP_ESTABLISHED))
....@@ -1967,7 +2127,7 @@
19672127 if (sk->sk_state == TCP_LISTEN)
19682128 goto out;
19692129
1970
- has_cmsg = tp->recvmsg_inq;
2130
+ cmsg_flags = tp->recvmsg_inq ? 1 : 0;
19712131 timeo = sock_rcvtimeo(sk, nonblock);
19722132
19732133 /* Urgent data needs to be handled specially. */
....@@ -2100,7 +2260,7 @@
21002260 }
21012261 continue;
21022262
2103
- found_ok_skb:
2263
+found_ok_skb:
21042264 /* Ok so how much can we use? */
21052265 used = skb->len - offset;
21062266 if (len < used)
....@@ -2148,8 +2308,7 @@
21482308
21492309 if (TCP_SKB_CB(skb)->has_rxtstamp) {
21502310 tcp_update_recv_tstamps(skb, &tss);
2151
- has_tss = true;
2152
- has_cmsg = true;
2311
+ cmsg_flags |= 2;
21532312 }
21542313
21552314 if (used + offset < skb->len)
....@@ -2161,7 +2320,7 @@
21612320 sk_eat_skb(sk, skb);
21622321 continue;
21632322
2164
- found_fin_ok:
2323
+found_fin_ok:
21652324 /* Process the FIN. */
21662325 WRITE_ONCE(*seq, *seq + 1);
21672326 if (!(flags & MSG_PEEK))
....@@ -2169,6 +2328,7 @@
21692328 break;
21702329 } while (len > 0);
21712330
2331
+ trace_android_rvh_tcp_recvmsg_stat(sk, copied);
21722332 /* According to UNIX98, msg_name/msg_namelen are ignored
21732333 * on connected socket. I was just happy when found this 8) --ANK
21742334 */
....@@ -2178,10 +2338,10 @@
21782338
21792339 release_sock(sk);
21802340
2181
- if (has_cmsg) {
2182
- if (has_tss)
2341
+ if (cmsg_flags) {
2342
+ if (cmsg_flags & 2)
21832343 tcp_recv_timestamp(msg, sk, &tss);
2184
- if (tp->recvmsg_inq) {
2344
+ if (cmsg_flags & 1) {
21852345 inq = tcp_inq_hint(sk);
21862346 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
21872347 }
@@ -2245,7 +2405,7 @@
 if (inet_csk(sk)->icsk_bind_hash &&
 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
 inet_put_port(sk);
- /* fall through */
+ fallthrough;
 default:
 if (oldstate == TCP_ESTABLISHED)
 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
@@ -2255,10 +2415,6 @@
 * socket sitting in hash tables.
 */
 inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
 }
 EXPORT_SYMBOL_GPL(tcp_set_state);
 
@@ -2335,14 +2491,13 @@
 return too_many_orphans || out_of_socket_memory;
 }
 
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
 {
 struct sk_buff *skb;
 int data_was_unread = 0;
 int state;
 
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 
 if (sk->sk_state == TCP_LISTEN) {
 tcp_set_state(sk, TCP_CLOSE);
@@ -2488,7 +2643,10 @@
 }
 
 if (sk->sk_state == TCP_CLOSE) {
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
 /* We could get here with a non-NULL req if the socket is
 * aborted (e.g., closed with unread data) before 3WHS
 * finishes.
@@ -2502,6 +2660,12 @@
 out:
 bh_unlock_sock(sk);
 local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
 release_sock(sk);
 sock_put(sk);
 }
@@ -2543,6 +2707,11 @@
 sk_wmem_free_skb(sk, skb);
 }
 tcp_rtx_queue_purge(sk);
+ skb = sk->sk_tx_skb_cache;
+ if (skb) {
+ __kfree_skb(skb);
+ sk->sk_tx_skb_cache = NULL;
+ }
 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 sk_mem_reclaim(sk);
 tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2579,6 +2748,10 @@
 
 tcp_clear_xmit_timers(sk);
 __skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_rx_skb_cache) {
+ __kfree_skb(sk->sk_rx_skb_cache);
+ sk->sk_rx_skb_cache = NULL;
+ }
 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 tp->urg_data = 0;
 tcp_write_queue_purge(sk);
@@ -2590,9 +2763,10 @@
 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 inet_reset_saddr(sk);
 
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
 sock_reset_flag(sk, SOCK_DONE);
 tp->srtt_us = 0;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 tp->rcv_rtt_last_tsecr = 0;
 
 seq = tp->write_seq + tp->max_window + 2;
@@ -2600,16 +2774,24 @@
 seq = 1;
 WRITE_ONCE(tp->write_seq, seq);
 
- tp->snd_cwnd = 2;
+ icsk->icsk_backoff = 0;
 icsk->icsk_probes_out = 0;
+ icsk->icsk_probes_tstamp = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd = TCP_INIT_CWND;
 tp->snd_cwnd_cnt = 0;
+ tp->is_cwnd_limited = 0;
+ tp->max_packets_out = 0;
 tp->window_clamp = 0;
 tp->delivered = 0;
 tp->delivered_ce = 0;
 if (icsk->icsk_ca_ops->release)
 icsk->icsk_ca_ops->release(sk);
 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ icsk->icsk_ca_initialized = 0;
 tcp_set_ca_state(sk, TCP_CA_Open);
 tp->is_sack_reneg = 0;
 tcp_clear_retrans(tp);
@@ -2621,8 +2803,7 @@
 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 __sk_dst_reset(sk);
- dst_release(sk->sk_rx_dst);
- sk->sk_rx_dst = NULL;
+ dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
 tcp_saved_syn_free(tp);
 tp->compressed_ack = 0;
 tp->segs_in = 0;
....@@ -2633,12 +2814,34 @@
26332814 tp->bytes_retrans = 0;
26342815 tp->data_segs_in = 0;
26352816 tp->data_segs_out = 0;
2817
+ tp->duplicate_sack[0].start_seq = 0;
2818
+ tp->duplicate_sack[0].end_seq = 0;
26362819 tp->dsack_dups = 0;
26372820 tp->reord_seen = 0;
2821
+ tp->retrans_out = 0;
2822
+ tp->sacked_out = 0;
2823
+ tp->tlp_high_seq = 0;
2824
+ tp->last_oow_ack_time = 0;
2825
+ /* There's a bubble in the pipe until at least the first ACK. */
2826
+ tp->app_limited = ~0U;
2827
+ tp->rate_app_limited = 1;
2828
+ tp->rack.mstamp = 0;
2829
+ tp->rack.advanced = 0;
2830
+ tp->rack.reo_wnd_steps = 1;
2831
+ tp->rack.last_delivered = 0;
2832
+ tp->rack.reo_wnd_persist = 0;
2833
+ tp->rack.dsack_seen = 0;
2834
+ tp->syn_data_acked = 0;
2835
+ tp->rx_opt.saw_tstamp = 0;
2836
+ tp->rx_opt.dsack = 0;
2837
+ tp->rx_opt.num_sacks = 0;
2838
+ tp->rcv_ooopack = 0;
2839
+
26382840
26392841 /* Clean up fastopen related fields */
26402842 tcp_free_fastopen_req(tp);
26412843 inet->defer_connect = 0;
2844
+ tp->fastopen_client_fail = 0;
26422845
26432846 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
26442847
....@@ -2659,7 +2862,7 @@
26592862 (sk->sk_state != TCP_LISTEN);
26602863 }
26612864
2662
-static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2865
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
26632866 {
26642867 struct tcp_repair_window opt;
26652868
....@@ -2669,7 +2872,7 @@
26692872 if (len != sizeof(opt))
26702873 return -EINVAL;
26712874
2672
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
2875
+ if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
26732876 return -EFAULT;
26742877
26752878 if (opt.max_window < opt.snd_wnd)
....@@ -2691,17 +2894,18 @@
26912894 return 0;
26922895 }
26932896
2694
-static int tcp_repair_options_est(struct sock *sk,
2695
- struct tcp_repair_opt __user *optbuf, unsigned int len)
2897
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
2898
+ unsigned int len)
26962899 {
26972900 struct tcp_sock *tp = tcp_sk(sk);
26982901 struct tcp_repair_opt opt;
2902
+ size_t offset = 0;
26992903
27002904 while (len >= sizeof(opt)) {
2701
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
2905
+ if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
27022906 return -EFAULT;
27032907
2704
- optbuf++;
2908
+ offset += sizeof(opt);
27052909 len -= sizeof(opt);
27062910
27072911 switch (opt.opt_code) {
....@@ -2740,11 +2944,185 @@
27402944 return 0;
27412945 }
27422946
2947
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2948
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
2949
+
2950
+static void tcp_enable_tx_delay(void)
2951
+{
2952
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2953
+ static int __tcp_tx_delay_enabled = 0;
2954
+
2955
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2956
+ static_branch_enable(&tcp_tx_delay_enabled);
2957
+ pr_info("TCP_TX_DELAY enabled\n");
2958
+ }
2959
+ }
2960
+}
2961
+
2962
+/* When set indicates to always queue non-full frames. Later the user clears
2963
+ * this option and we transmit any pending partial frames in the queue. This is
2964
+ * meant to be used alongside sendfile() to get properly filled frames when the
2965
+ * user (for example) must write out headers with a write() call first and then
2966
+ * use sendfile to send out the data parts.
2967
+ *
2968
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
2969
+ * TCP_NODELAY.
2970
+ */
2971
+static void __tcp_sock_set_cork(struct sock *sk, bool on)
2972
+{
2973
+ struct tcp_sock *tp = tcp_sk(sk);
2974
+
2975
+ if (on) {
2976
+ tp->nonagle |= TCP_NAGLE_CORK;
2977
+ } else {
2978
+ tp->nonagle &= ~TCP_NAGLE_CORK;
2979
+ if (tp->nonagle & TCP_NAGLE_OFF)
2980
+ tp->nonagle |= TCP_NAGLE_PUSH;
2981
+ tcp_push_pending_frames(sk);
2982
+ }
2983
+}
2984
+
2985
+void tcp_sock_set_cork(struct sock *sk, bool on)
2986
+{
2987
+ lock_sock(sk);
2988
+ __tcp_sock_set_cork(sk, on);
2989
+ release_sock(sk);
2990
+}
2991
+EXPORT_SYMBOL(tcp_sock_set_cork);
2992
+
2993
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
2994
+ * remembered, but it is not activated until cork is cleared.
2995
+ *
2996
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
2997
+ * even TCP_CORK for currently queued segments.
2998
+ */
2999
+static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3000
+{
3001
+ if (on) {
3002
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3003
+ tcp_push_pending_frames(sk);
3004
+ } else {
3005
+ tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3006
+ }
3007
+}
3008
+
3009
+void tcp_sock_set_nodelay(struct sock *sk)
3010
+{
3011
+ lock_sock(sk);
3012
+ __tcp_sock_set_nodelay(sk, true);
3013
+ release_sock(sk);
3014
+}
3015
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
3016
+
3017
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
3018
+{
3019
+ if (!val) {
3020
+ inet_csk_enter_pingpong_mode(sk);
3021
+ return;
3022
+ }
3023
+
3024
+ inet_csk_exit_pingpong_mode(sk);
3025
+ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3026
+ inet_csk_ack_scheduled(sk)) {
3027
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3028
+ tcp_cleanup_rbuf(sk, 1);
3029
+ if (!(val & 1))
3030
+ inet_csk_enter_pingpong_mode(sk);
3031
+ }
3032
+}
3033
+
3034
+void tcp_sock_set_quickack(struct sock *sk, int val)
3035
+{
3036
+ lock_sock(sk);
3037
+ __tcp_sock_set_quickack(sk, val);
3038
+ release_sock(sk);
3039
+}
3040
+EXPORT_SYMBOL(tcp_sock_set_quickack);
3041
+
3042
+int tcp_sock_set_syncnt(struct sock *sk, int val)
3043
+{
3044
+ if (val < 1 || val > MAX_TCP_SYNCNT)
3045
+ return -EINVAL;
3046
+
3047
+ lock_sock(sk);
3048
+ inet_csk(sk)->icsk_syn_retries = val;
3049
+ release_sock(sk);
3050
+ return 0;
3051
+}
3052
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
3053
+
3054
+void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3055
+{
3056
+ lock_sock(sk);
3057
+ WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
3058
+ release_sock(sk);
3059
+}
3060
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3061
+
3062
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3063
+{
3064
+ struct tcp_sock *tp = tcp_sk(sk);
3065
+
3066
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
3067
+ return -EINVAL;
3068
+
3069
+ /* Paired with WRITE_ONCE() in keepalive_time_when() */
3070
+ WRITE_ONCE(tp->keepalive_time, val * HZ);
3071
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
3072
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3073
+ u32 elapsed = keepalive_time_elapsed(tp);
3074
+
3075
+ if (tp->keepalive_time > elapsed)
3076
+ elapsed = tp->keepalive_time - elapsed;
3077
+ else
3078
+ elapsed = 0;
3079
+ inet_csk_reset_keepalive_timer(sk, elapsed);
3080
+ }
3081
+
3082
+ return 0;
3083
+}
3084
+
3085
+int tcp_sock_set_keepidle(struct sock *sk, int val)
3086
+{
3087
+ int err;
3088
+
3089
+ lock_sock(sk);
3090
+ err = tcp_sock_set_keepidle_locked(sk, val);
3091
+ release_sock(sk);
3092
+ return err;
3093
+}
3094
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
3095
+
3096
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
3097
+{
3098
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
3099
+ return -EINVAL;
3100
+
3101
+ lock_sock(sk);
3102
+ WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
3103
+ release_sock(sk);
3104
+ return 0;
3105
+}
3106
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3107
+
3108
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
3109
+{
3110
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
3111
+ return -EINVAL;
3112
+
3113
+ lock_sock(sk);
3114
+ /* Paired with READ_ONCE() in keepalive_probes() */
3115
+ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
3116
+ release_sock(sk);
3117
+ return 0;
3118
+}
3119
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3120
+
27433121 /*
27443122 * Socket option code for TCP.
27453123 */
2746
-static int do_tcp_setsockopt(struct sock *sk, int level,
2747
- int optname, char __user *optval, unsigned int optlen)
3124
+static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3125
+ sockptr_t optval, unsigned int optlen)
27483126 {
27493127 struct tcp_sock *tp = tcp_sk(sk);
27503128 struct inet_connection_sock *icsk = inet_csk(sk);
....@@ -2760,14 +3138,14 @@
27603138 if (optlen < 1)
27613139 return -EINVAL;
27623140
2763
- val = strncpy_from_user(name, optval,
3141
+ val = strncpy_from_sockptr(name, optval,
27643142 min_t(long, TCP_CA_NAME_MAX-1, optlen));
27653143 if (val < 0)
27663144 return -EFAULT;
27673145 name[val] = 0;
27683146
27693147 lock_sock(sk);
2770
- err = tcp_set_congestion_control(sk, name, true, true,
3148
+ err = tcp_set_congestion_control(sk, name, true,
27713149 ns_capable(sock_net(sk)->user_ns,
27723150 CAP_NET_ADMIN));
27733151 release_sock(sk);
....@@ -2779,7 +3157,7 @@
27793157 if (optlen < 1)
27803158 return -EINVAL;
27813159
2782
- val = strncpy_from_user(name, optval,
3160
+ val = strncpy_from_sockptr(name, optval,
27833161 min_t(long, TCP_ULP_NAME_MAX - 1,
27843162 optlen));
27853163 if (val < 0)
....@@ -2792,15 +3170,23 @@
27923170 return err;
27933171 }
27943172 case TCP_FASTOPEN_KEY: {
2795
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3173
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3174
+ __u8 *backup_key = NULL;
27963175
2797
- if (optlen != sizeof(key))
3176
+ /* Allow a backup key as well to facilitate key rotation
3177
+ * First key is the active one.
3178
+ */
3179
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3180
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
27983181 return -EINVAL;
27993182
2800
- if (copy_from_user(key, optval, optlen))
3183
+ if (copy_from_sockptr(key, optval, optlen))
28013184 return -EFAULT;
28023185
2803
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
3186
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3187
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3188
+
3189
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
28043190 }
28053191 default:
28063192 /* fallthru */
....@@ -2810,7 +3196,7 @@
28103196 if (optlen < sizeof(int))
28113197 return -EINVAL;
28123198
2813
- if (get_user(val, (int __user *)optval))
3199
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
28143200 return -EFAULT;
28153201
28163202 lock_sock(sk);
....@@ -2829,20 +3215,7 @@
28293215 break;
28303216
28313217 case TCP_NODELAY:
2832
- if (val) {
2833
- /* TCP_NODELAY is weaker than TCP_CORK, so that
2834
- * this option on corked socket is remembered, but
2835
- * it is not activated until cork is cleared.
2836
- *
2837
- * However, when TCP_NODELAY is set we make
2838
- * an explicit push, which overrides even TCP_CORK
2839
- * for currently queued segments.
2840
- */
2841
- tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2842
- tcp_push_pending_frames(sk);
2843
- } else {
2844
- tp->nonagle &= ~TCP_NAGLE_OFF;
2845
- }
3218
+ __tcp_sock_set_nodelay(sk, val);
28463219 break;
28473220
28483221 case TCP_THIN_LINEAR_TIMEOUTS:
....@@ -2908,64 +3281,30 @@
29083281 case TCP_REPAIR_OPTIONS:
29093282 if (!tp->repair)
29103283 err = -EINVAL;
2911
- else if (sk->sk_state == TCP_ESTABLISHED)
2912
- err = tcp_repair_options_est(sk,
2913
- (struct tcp_repair_opt __user *)optval,
2914
- optlen);
3284
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
3285
+ err = tcp_repair_options_est(sk, optval, optlen);
29153286 else
29163287 err = -EPERM;
29173288 break;
29183289
29193290 case TCP_CORK:
2920
- /* When set indicates to always queue non-full frames.
2921
- * Later the user clears this option and we transmit
2922
- * any pending partial frames in the queue. This is
2923
- * meant to be used alongside sendfile() to get properly
2924
- * filled frames when the user (for example) must write
2925
- * out headers with a write() call first and then use
2926
- * sendfile to send out the data parts.
2927
- *
2928
- * TCP_CORK can be set together with TCP_NODELAY and it is
2929
- * stronger than TCP_NODELAY.
2930
- */
2931
- if (val) {
2932
- tp->nonagle |= TCP_NAGLE_CORK;
2933
- } else {
2934
- tp->nonagle &= ~TCP_NAGLE_CORK;
2935
- if (tp->nonagle&TCP_NAGLE_OFF)
2936
- tp->nonagle |= TCP_NAGLE_PUSH;
2937
- tcp_push_pending_frames(sk);
2938
- }
3291
+ __tcp_sock_set_cork(sk, val);
29393292 break;
29403293
29413294 case TCP_KEEPIDLE:
2942
- if (val < 1 || val > MAX_TCP_KEEPIDLE)
2943
- err = -EINVAL;
2944
- else {
2945
- tp->keepalive_time = val * HZ;
2946
- if (sock_flag(sk, SOCK_KEEPOPEN) &&
2947
- !((1 << sk->sk_state) &
2948
- (TCPF_CLOSE | TCPF_LISTEN))) {
2949
- u32 elapsed = keepalive_time_elapsed(tp);
2950
- if (tp->keepalive_time > elapsed)
2951
- elapsed = tp->keepalive_time - elapsed;
2952
- else
2953
- elapsed = 0;
2954
- inet_csk_reset_keepalive_timer(sk, elapsed);
2955
- }
2956
- }
3295
+ err = tcp_sock_set_keepidle_locked(sk, val);
29573296 break;
29583297 case TCP_KEEPINTVL:
29593298 if (val < 1 || val > MAX_TCP_KEEPINTVL)
29603299 err = -EINVAL;
29613300 else
2962
- tp->keepalive_intvl = val * HZ;
3301
+ WRITE_ONCE(tp->keepalive_intvl, val * HZ);
29633302 break;
29643303 case TCP_KEEPCNT:
29653304 if (val < 1 || val > MAX_TCP_KEEPCNT)
29663305 err = -EINVAL;
29673306 else
2968
- tp->keepalive_probes = val;
3307
+ WRITE_ONCE(tp->keepalive_probes, val);
29693308 break;
29703309 case TCP_SYNCNT:
29713310 if (val < 1 || val > MAX_TCP_SYNCNT)
....@@ -2975,7 +3314,8 @@
29753314 break;
29763315
29773316 case TCP_SAVE_SYN:
2978
- if (val < 0 || val > 1)
3317
+ /* 0: disable, 1: enable, 2: start from ether_header */
3318
+ if (val < 0 || val > 2)
29793319 err = -EINVAL;
29803320 else
29813321 tp->save_syn = val;
....@@ -2983,18 +3323,18 @@
29833323
29843324 case TCP_LINGER2:
29853325 if (val < 0)
2986
- tp->linger2 = -1;
2987
- else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2988
- tp->linger2 = 0;
3326
+ WRITE_ONCE(tp->linger2, -1);
3327
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3328
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
29893329 else
2990
- tp->linger2 = val * HZ;
3330
+ WRITE_ONCE(tp->linger2, val * HZ);
29913331 break;
29923332
29933333 case TCP_DEFER_ACCEPT:
29943334 /* Translate value in seconds to number of retransmits */
2995
- icsk->icsk_accept_queue.rskq_defer_accept =
2996
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2997
- TCP_RTO_MAX / HZ);
3335
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
3336
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3337
+ TCP_RTO_MAX / HZ));
29983338 break;
29993339
30003340 case TCP_WINDOW_CLAMP:
....@@ -3010,19 +3350,7 @@
30103350 break;
30113351
30123352 case TCP_QUICKACK:
3013
- if (!val) {
3014
- icsk->icsk_ack.pingpong = 1;
3015
- } else {
3016
- icsk->icsk_ack.pingpong = 0;
3017
- if ((1 << sk->sk_state) &
3018
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3019
- inet_csk_ack_scheduled(sk)) {
3020
- icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3021
- tcp_cleanup_rbuf(sk, 1);
3022
- if (!(val & 1))
3023
- icsk->icsk_ack.pingpong = 1;
3024
- }
3025
- }
3353
+ __tcp_sock_set_quickack(sk, val);
30263354 break;
30273355
30283356 #ifdef CONFIG_TCP_MD5SIG
....@@ -3038,7 +3366,7 @@
30383366 if (val < 0)
30393367 err = -EINVAL;
30403368 else
3041
- icsk->icsk_user_timeout = val;
3369
+ WRITE_ONCE(icsk->icsk_user_timeout, val);
30423370 break;
30433371
30443372 case TCP_FASTOPEN:
....@@ -3054,7 +3382,8 @@
30543382 case TCP_FASTOPEN_CONNECT:
30553383 if (val > 1 || val < 0) {
30563384 err = -EINVAL;
3057
- } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3385
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3386
+ TFO_CLIENT_ENABLE) {
30583387 if (sk->sk_state == TCP_CLOSE)
30593388 tp->fastopen_connect = val;
30603389 else
....@@ -3081,7 +3410,7 @@
30813410 err = tcp_repair_set_window(tp, optval, optlen);
30823411 break;
30833412 case TCP_NOTSENT_LOWAT:
3084
- tp->notsent_lowat = val;
3413
+ WRITE_ONCE(tp->notsent_lowat, val);
30853414 sk->sk_write_space(sk);
30863415 break;
30873416 case TCP_INQ:
....@@ -3089,6 +3418,11 @@
30893418 err = -EINVAL;
30903419 else
30913420 tp->recvmsg_inq = val;
3421
+ break;
3422
+ case TCP_TX_DELAY:
3423
+ if (val)
3424
+ tcp_enable_tx_delay();
3425
+ WRITE_ONCE(tp->tcp_tx_delay, val);
30923426 break;
30933427 default:
30943428 err = -ENOPROTOOPT;
....@@ -3099,7 +3433,7 @@
30993433 return err;
31003434 }
31013435
3102
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3436
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
31033437 unsigned int optlen)
31043438 {
31053439 const struct inet_connection_sock *icsk = inet_csk(sk);
....@@ -3110,18 +3444,6 @@
31103444 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
31113445 }
31123446 EXPORT_SYMBOL(tcp_setsockopt);
3113
-
3114
-#ifdef CONFIG_COMPAT
3115
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3116
- char __user *optval, unsigned int optlen)
3117
-{
3118
- if (level != SOL_TCP)
3119
- return inet_csk_compat_setsockopt(sk, level, optname,
3120
- optval, optlen);
3121
- return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3122
-}
3123
-EXPORT_SYMBOL(compat_tcp_setsockopt);
3124
-#endif
31253447
31263448 static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
31273449 struct tcp_info *info)
....@@ -3147,10 +3469,10 @@
31473469 {
31483470 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
31493471 const struct inet_connection_sock *icsk = inet_csk(sk);
3472
+ unsigned long rate;
31503473 u32 now;
31513474 u64 rate64;
31523475 bool slow;
3153
- u32 rate;
31543476
31553477 memset(info, 0, sizeof(*info));
31563478 if (sk->sk_type != SOCK_STREAM)
....@@ -3160,11 +3482,11 @@
31603482
31613483 /* Report meaningful fields for all TCP states, including listeners */
31623484 rate = READ_ONCE(sk->sk_pacing_rate);
3163
- rate64 = rate != ~0U ? rate : ~0ULL;
3485
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
31643486 info->tcpi_pacing_rate = rate64;
31653487
31663488 rate = READ_ONCE(sk->sk_max_pacing_rate);
3167
- rate64 = rate != ~0U ? rate : ~0ULL;
3489
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
31683490 info->tcpi_max_pacing_rate = rate64;
31693491
31703492 info->tcpi_reordering = tp->reordering;
....@@ -3175,8 +3497,8 @@
31753497 * tcpi_unacked -> Number of children ready for accept()
31763498 * tcpi_sacked -> max backlog
31773499 */
3178
- info->tcpi_unacked = sk->sk_ack_backlog;
3179
- info->tcpi_sacked = sk->sk_max_ack_backlog;
3500
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3501
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
31803502 return;
31813503 }
31823504
....@@ -3254,6 +3576,9 @@
32543576 info->tcpi_bytes_retrans = tp->bytes_retrans;
32553577 info->tcpi_dsack_dups = tp->dsack_dups;
32563578 info->tcpi_reord_seen = tp->reord_seen;
3579
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3580
+ info->tcpi_snd_wnd = tp->snd_wnd;
3581
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
32573582 unlock_sock_fast(sk, slow);
32583583 }
32593584 EXPORT_SYMBOL_GPL(tcp_get_info);
....@@ -3282,16 +3607,21 @@
32823607 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
32833608 nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
32843609 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3610
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
3611
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
3612
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
3613
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
32853614 0;
32863615 }
32873616
3288
-struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3617
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3618
+ const struct sk_buff *orig_skb)
32893619 {
32903620 const struct tcp_sock *tp = tcp_sk(sk);
32913621 struct sk_buff *stats;
32923622 struct tcp_info info;
3623
+ unsigned long rate;
32933624 u64 rate64;
3294
- u32 rate;
32953625
32963626 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
32973627 if (!stats)
....@@ -3310,7 +3640,7 @@
33103640 tp->total_retrans, TCP_NLA_PAD);
33113641
33123642 rate = READ_ONCE(sk->sk_pacing_rate);
3313
- rate64 = rate != ~0U ? rate : ~0ULL;
3643
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
33143644 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
33153645
33163646 rate64 = tcp_compute_delivery_rate(tp);
....@@ -3335,6 +3665,12 @@
33353665 TCP_NLA_PAD);
33363666 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
33373667 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3668
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3669
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3670
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3671
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
3672
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3673
+ TCP_NLA_PAD);
33383674
33393675 return stats;
33403676 }
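
The TCP_NLA_BYTES_NOTSENT attribute added above is computed as max_t(int, 0, tp->write_seq - tp->snd_nxt). Both counters are u32 sequence values, so a momentarily stale pair of reads could otherwise wrap to a huge unsigned result; the clamp reports 0 instead. A small userspace sketch of the same clamp (illustrative names, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* Bytes queued but not yet sent. If snd_nxt is read as being ahead
	 * of write_seq, the signed interpretation goes negative and is
	 * clamped to 0 rather than wrapping to ~4 GB.
	 */
	static uint32_t bytes_not_sent(uint32_t write_seq, uint32_t snd_nxt)
	{
		int32_t delta = (int32_t)(write_seq - snd_nxt);

		return delta > 0 ? (uint32_t)delta : 0;
	}

	int main(void)
	{
		printf("%u\n", bytes_not_sent(1000, 400));	/* 600 */
		printf("%u\n", bytes_not_sent(400, 1000));	/* clamped to 0 */
		return 0;
	}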
....@@ -3358,7 +3694,8 @@
33583694 switch (optname) {
33593695 case TCP_MAXSEG:
33603696 val = tp->mss_cache;
3361
- if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3697
+ if (tp->rx_opt.user_mss &&
3698
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
33623699 val = tp->rx_opt.user_mss;
33633700 if (tp->repair)
33643701 val = tp->rx_opt.mss_clamp;
....@@ -3382,13 +3719,14 @@
33823719 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
33833720 break;
33843721 case TCP_LINGER2:
3385
- val = tp->linger2;
3722
+ val = READ_ONCE(tp->linger2);
33863723 if (val >= 0)
3387
- val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3724
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
33883725 break;
33893726 case TCP_DEFER_ACCEPT:
3390
- val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3391
- TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3727
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
3728
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
3729
+ TCP_RTO_MAX / HZ);
33923730 break;
33933731 case TCP_WINDOW_CLAMP:
33943732 val = tp->window_clamp;
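
Several of the getsockopt() cases above (TCP_LINGER2, TCP_DEFER_ACCEPT, and further down TCP_USER_TIMEOUT, TCP_FASTOPEN, TCP_NOTSENT_LOWAT) switch to READ_ONCE() because these fields can be updated while this reader does not hold the socket lock; the writers are expected to publish with matching WRITE_ONCE() stores. A minimal userspace sketch of the pairing, with simplified stand-ins for the kernel macros from <linux/compiler.h>:

	#include <stdio.h>

	/* Simplified stand-ins: force a single, untorn access and document
	 * that the field is read/written without the socket lock.
	 */
	#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
	#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

	struct fake_tp {
		int linger2;	/* may be set by TCP_LINGER2 concurrently */
	};

	/* writer side: publish the new value */
	static void set_linger2(struct fake_tp *tp, int val)
	{
		WRITE_ONCE(tp->linger2, val);
	}

	/* reader side (getsockopt): lockless annotated read, as in the hunk */
	static int get_linger2(struct fake_tp *tp)
	{
		return READ_ONCE(tp->linger2);
	}

	int main(void)
	{
		struct fake_tp tp = { .linger2 = 0 };

		set_linger2(&tp, 7);
		printf("%d\n", get_linger2(&tp));
		return 0;
	}

The annotation does not add ordering by itself; it prevents torn or repeated loads and marks the access as intentionally lockless.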
....@@ -3429,7 +3767,7 @@
34293767 return 0;
34303768 }
34313769 case TCP_QUICKACK:
3432
- val = !icsk->icsk_ack.pingpong;
3770
+ val = !inet_csk_in_pingpong_mode(sk);
34333771 break;
34343772
34353773 case TCP_CONGESTION:
....@@ -3458,21 +3796,15 @@
34583796 return 0;
34593797
34603798 case TCP_FASTOPEN_KEY: {
3461
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3462
- struct tcp_fastopen_context *ctx;
3799
+ u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3800
+ unsigned int key_len;
34633801
34643802 if (get_user(len, optlen))
34653803 return -EFAULT;
34663804
3467
- rcu_read_lock();
3468
- ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
3469
- if (ctx)
3470
- memcpy(key, ctx->key, sizeof(key));
3471
- else
3472
- len = 0;
3473
- rcu_read_unlock();
3474
-
3475
- len = min_t(unsigned int, len, sizeof(key));
3805
+ key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3806
+ TCP_FASTOPEN_KEY_LENGTH;
3807
+ len = min_t(unsigned int, len, key_len);
34763808 if (put_user(len, optlen))
34773809 return -EFAULT;
34783810 if (copy_to_user(optval, key, len))
....@@ -3530,11 +3862,11 @@
35303862 break;
35313863
35323864 case TCP_USER_TIMEOUT:
3533
- val = icsk->icsk_user_timeout;
3865
+ val = READ_ONCE(icsk->icsk_user_timeout);
35343866 break;
35353867
35363868 case TCP_FASTOPEN:
3537
- val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3869
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
35383870 break;
35393871
35403872 case TCP_FASTOPEN_CONNECT:
....@@ -3545,11 +3877,15 @@
35453877 val = tp->fastopen_no_cookie;
35463878 break;
35473879
3880
+ case TCP_TX_DELAY:
3881
+ val = READ_ONCE(tp->tcp_tx_delay);
3882
+ break;
3883
+
35483884 case TCP_TIMESTAMP:
35493885 val = tcp_time_stamp_raw() + tp->tsoffset;
35503886 break;
35513887 case TCP_NOTSENT_LOWAT:
3552
- val = tp->notsent_lowat;
3888
+ val = READ_ONCE(tp->notsent_lowat);
35533889 break;
35543890 case TCP_INQ:
35553891 val = tp->recvmsg_inq;
....@@ -3563,20 +3899,21 @@
35633899
35643900 lock_sock(sk);
35653901 if (tp->saved_syn) {
3566
- if (len < tp->saved_syn[0]) {
3567
- if (put_user(tp->saved_syn[0], optlen)) {
3902
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
3903
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
3904
+ optlen)) {
35683905 release_sock(sk);
35693906 return -EFAULT;
35703907 }
35713908 release_sock(sk);
35723909 return -EINVAL;
35733910 }
3574
- len = tp->saved_syn[0];
3911
+ len = tcp_saved_syn_len(tp->saved_syn);
35753912 if (put_user(len, optlen)) {
35763913 release_sock(sk);
35773914 return -EFAULT;
35783915 }
3579
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3916
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
35803917 release_sock(sk);
35813918 return -EFAULT;
35823919 }
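
The TCP_SAVED_SYN hunk above reflects saved_syn changing from a bare u32 array, whose first element held the length (tp->saved_syn[0]) with the header bytes starting at tp->saved_syn + 1, to a structure accessed through tcp_saved_syn_len() and a data[] member. A rough sketch of that shape (field names below are illustrative, not copied from the kernel headers):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Length-prefixed array replaced by an explicit struct with a
	 * flexible array member holding the saved SYN headers.
	 */
	struct saved_syn_sketch {
		uint32_t network_hdrlen;
		uint32_t tcp_hdrlen;
		uint8_t data[];		/* network + TCP headers of the SYN */
	};

	static uint32_t saved_syn_len(const struct saved_syn_sketch *syn)
	{
		return syn->network_hdrlen + syn->tcp_hdrlen;
	}

	int main(void)
	{
		const uint8_t hdrs[] = { 0x45, 0x00, 0x00, 0x3c };	/* fake bytes */
		struct saved_syn_sketch *syn;

		syn = malloc(sizeof(*syn) + sizeof(hdrs));
		if (!syn)
			return 1;
		syn->network_hdrlen = sizeof(hdrs);
		syn->tcp_hdrlen = 0;
		memcpy(syn->data, hdrs, sizeof(hdrs));

		printf("saved SYN length: %u\n", saved_syn_len(syn));
		free(syn);
		return 0;
	}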
....@@ -3592,18 +3929,41 @@
35923929 }
35933930 #ifdef CONFIG_MMU
35943931 case TCP_ZEROCOPY_RECEIVE: {
3595
- struct tcp_zerocopy_receive zc;
3932
+ struct tcp_zerocopy_receive zc = {};
35963933 int err;
35973934
35983935 if (get_user(len, optlen))
35993936 return -EFAULT;
3600
- if (len != sizeof(zc))
3937
+ if (len < 0 ||
3938
+ len < offsetofend(struct tcp_zerocopy_receive, length))
36013939 return -EINVAL;
3940
+ if (len > sizeof(zc)) {
3941
+ len = sizeof(zc);
3942
+ if (put_user(len, optlen))
3943
+ return -EFAULT;
3944
+ }
36023945 if (copy_from_user(&zc, optval, len))
36033946 return -EFAULT;
36043947 lock_sock(sk);
36053948 err = tcp_zerocopy_receive(sk, &zc);
36063949 release_sock(sk);
3950
+ if (len >= offsetofend(struct tcp_zerocopy_receive, err))
3951
+ goto zerocopy_rcv_sk_err;
3952
+ switch (len) {
3953
+ case offsetofend(struct tcp_zerocopy_receive, err):
3954
+ goto zerocopy_rcv_sk_err;
3955
+ case offsetofend(struct tcp_zerocopy_receive, inq):
3956
+ goto zerocopy_rcv_inq;
3957
+ case offsetofend(struct tcp_zerocopy_receive, length):
3958
+ default:
3959
+ goto zerocopy_rcv_out;
3960
+ }
3961
+zerocopy_rcv_sk_err:
3962
+ if (!err)
3963
+ zc.err = sock_error(sk);
3964
+zerocopy_rcv_inq:
3965
+ zc.inq = tcp_inq_hint(sk);
3966
+zerocopy_rcv_out:
36073967 if (!err && copy_to_user(optval, &zc, len))
36083968 err = -EFAULT;
36093969 return err;
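
TCP_ZEROCOPY_RECEIVE above now accepts any optlen that covers at least the length field, truncates over-long requests to sizeof(zc), and uses offsetofend() to decide which trailing fields (inq, err) the caller's struct version can hold, so older and newer binaries keep working against the same kernel. A standalone sketch of the idea, using an if-chain in place of the goto cascade and a simplified stand-in struct (field set and values are illustrative):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* offsetofend() as in the kernel: offset of the first byte *after*
	 * MEMBER. Comparing the caller's optlen against it tells us which
	 * appended fields the caller already knows about.
	 */
	#define offsetofend(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

	struct zc_sketch {
		uint64_t address;
		uint32_t length;
		uint32_t recv_skip_hint;
		int32_t  inq;		/* newer field */
		int32_t  err;		/* newest field */
	};

	/* Fill only the fields the caller's length covers. */
	static void fill_zc(struct zc_sketch *zc, size_t len)
	{
		if (len >= offsetofend(struct zc_sketch, err))
			zc->err = -11;			/* e.g. EAGAIN */
		if (len >= offsetofend(struct zc_sketch, inq))
			zc->inq = 512;			/* pending bytes */
		zc->length = 4096;			/* always covered */
	}

	int main(void)
	{
		struct zc_sketch zc;

		memset(&zc, 0, sizeof(zc));
		/* an "old" caller whose struct ends at inq */
		fill_zc(&zc, offsetofend(struct zc_sketch, inq));
		printf("length=%u inq=%d err=%d\n", zc.length, zc.inq, zc.err);
		return 0;
	}

Only appending fields keeps this scheme safe: the kernel never writes past the caller's declared length, and copy_to_user() copies exactly len bytes back.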
....@@ -3631,18 +3991,6 @@
36313991 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
36323992 }
36333993 EXPORT_SYMBOL(tcp_getsockopt);
3634
-
3635
-#ifdef CONFIG_COMPAT
3636
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3637
- char __user *optval, int __user *optlen)
3638
-{
3639
- if (level != SOL_TCP)
3640
- return inet_csk_compat_getsockopt(sk, level, optname,
3641
- optval, optlen);
3642
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3643
-}
3644
-EXPORT_SYMBOL(compat_tcp_getsockopt);
3645
-#endif
36463994
36473995 #ifdef CONFIG_TCP_MD5SIG
36483996 static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
....@@ -3686,20 +4034,28 @@
36864034 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
36874035 */
36884036 smp_wmb();
3689
- tcp_md5sig_pool_populated = true;
4037
+ /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
4038
+ * and tcp_get_md5sig_pool().
4039
+ */
4040
+ WRITE_ONCE(tcp_md5sig_pool_populated, true);
36904041 }
36914042
36924043 bool tcp_alloc_md5sig_pool(void)
36934044 {
3694
- if (unlikely(!tcp_md5sig_pool_populated)) {
4045
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4046
+ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
36954047 mutex_lock(&tcp_md5sig_mutex);
36964048
3697
- if (!tcp_md5sig_pool_populated)
4049
+ if (!tcp_md5sig_pool_populated) {
36984050 __tcp_alloc_md5sig_pool();
4051
+ if (tcp_md5sig_pool_populated)
4052
+ static_branch_inc(&tcp_md5_needed);
4053
+ }
36994054
37004055 mutex_unlock(&tcp_md5sig_mutex);
37014056 }
3702
- return tcp_md5sig_pool_populated;
4057
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4058
+ return READ_ONCE(tcp_md5sig_pool_populated);
37034059 }
37044060 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
37054061
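
tcp_alloc_md5sig_pool() above is the classic "check, lock, re-check" lazy initializer: the populated flag is read locklessly with READ_ONCE(), the expensive setup runs at most once under the mutex, and the flag is published with WRITE_ONCE() (paired in the kernel with smp_wmb()/smp_rmb() and, in this version, a static_branch_inc() of tcp_md5_needed). A userspace sketch of just the locking/flag pattern, with simplified macro stand-ins:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
	#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

	static pthread_mutex_t pool_mutex = PTHREAD_MUTEX_INITIALIZER;
	static bool pool_populated;

	static void build_pool(void)
	{
		/* expensive one-time setup would go here */
		WRITE_ONCE(pool_populated, true);
	}

	static bool alloc_pool(void)
	{
		if (!READ_ONCE(pool_populated)) {	/* fast path, no lock */
			pthread_mutex_lock(&pool_mutex);
			if (!pool_populated)		/* re-check under the lock */
				build_pool();
			pthread_mutex_unlock(&pool_mutex);
		}
		return READ_ONCE(pool_populated);
	}

	int main(void)
	{
		printf("%d\n", alloc_pool());
		return 0;
	}

The barriers omitted here are what let tcp_get_md5sig_pool() trust the pool contents once it observes the flag as true.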
....@@ -3715,7 +4071,8 @@
37154071 {
37164072 local_bh_disable();
37174073
3718
- if (tcp_md5sig_pool_populated) {
4074
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4075
+ if (READ_ONCE(tcp_md5sig_pool_populated)) {
37194076 /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
37204077 smp_rmb();
37214078 return this_cpu_ptr(&tcp_md5sig_pool);
....@@ -3745,8 +4102,8 @@
37454102 return 1;
37464103
37474104 for (i = 0; i < shi->nr_frags; ++i) {
3748
- const struct skb_frag_struct *f = &shi->frags[i];
3749
- unsigned int offset = f->page_offset;
4105
+ const skb_frag_t *f = &shi->frags[i];
4106
+ unsigned int offset = skb_frag_off(f);
37504107 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
37514108
37524109 sg_set_page(&sg, page, skb_frag_size(f),
....@@ -3772,8 +4129,8 @@
37724129 sg_init_one(&sg, key->key, keylen);
37734130 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
37744131
3775
- /* tcp_md5_do_add() might change key->key under us */
3776
- return crypto_ahash_update(hp->md5_req);
4132
+ /* We use data_race() because tcp_md5_do_add() might change key->key under us */
4133
+ return data_race(crypto_ahash_update(hp->md5_req));
37774134 }
37784135 EXPORT_SYMBOL(tcp_md5_hash_key);
37794136
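
The data_race() wrapper added above marks the read of key->key as an intentional, tolerated race so KCSAN does not report it; functionally the expression is evaluated exactly as before. A tiny illustration with a userspace stand-in for the macro (the real one lives in <linux/compiler.h> and also suppresses instrumentation):

	#include <stdio.h>

	/* Stand-in: just evaluate the expression; the kernel version
	 * additionally tells KCSAN the racy access is deliberate.
	 */
	#define data_race(expr)		({ (expr); })

	static int shared_counter;	/* may be updated concurrently elsewhere */

	static int read_counter(void)
	{
		/* the read may race with a writer; any observed value is fine */
		return data_race(shared_counter);
	}

	int main(void)
	{
		printf("%d\n", read_counter());
		return 0;
	}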
....@@ -3781,7 +4138,13 @@
37814138
37824139 void tcp_done(struct sock *sk)
37834140 {
3784
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
4141
+ struct request_sock *req;
4142
+
4143
+ /* We might be called with a new socket, after
4144
+ * inet_csk_prepare_forced_close() has been called
4145
+ * so we can not use lockdep_sock_is_held(sk)
4146
+ */
4147
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
37854148
37864149 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
37874150 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
....@@ -3791,7 +4154,7 @@
37914154 if (req)
37924155 reqsk_fastopen_remove(sk, req, false);
37934156
3794
- sk->sk_shutdown = SHUTDOWN_MASK;
4157
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
37954158
37964159 if (!sock_flag(sk, SOCK_DEAD))
37974160 sk->sk_state_change(sk);
....@@ -3880,7 +4243,7 @@
38804243
38814244 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
38824245 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3883
- FIELD_SIZEOF(struct sk_buff, cb));
4246
+ sizeof_field(struct sk_buff, cb));
38844247
38854248 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
38864249 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
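
FIELD_SIZEOF() was renamed to sizeof_field(); the BUILD_BUG_ON() above keeps verifying at compile time that struct tcp_skb_cb fits inside sk_buff->cb. A userspace equivalent of the same check (struct contents here are placeholders; only the 48-byte cb size matches the real sk_buff):

	#include <stdio.h>

	/* Size of a struct member without needing an instance. */
	#define sizeof_field(TYPE, MEMBER)	sizeof(((TYPE *)0)->MEMBER)

	struct sk_buff_sketch {
		char cb[48];		/* control buffer, as in struct sk_buff */
	};

	struct tcp_skb_cb_sketch {
		unsigned int seq;
		unsigned int end_seq;
		/* ... the real struct carries more per-packet state ... */
	};

	/* fails to compile if the private cb state outgrows the cb[] array */
	_Static_assert(sizeof(struct tcp_skb_cb_sketch) <=
		       sizeof_field(struct sk_buff_sketch, cb),
		       "tcp_skb_cb_sketch must fit in sk_buff_sketch.cb");

	int main(void)
	{
		printf("cb size: %zu\n", sizeof_field(struct sk_buff_sketch, cb));
		return 0;
	}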
....@@ -3954,4 +4317,5 @@
39544317 tcp_metrics_init();
39554318 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
39564319 tcp_tasklet_init();
4320
+ mptcp_init();
39574321 }