forked from ~ljy/RK356X_SDK_RELEASE

hc, 2023-12-09
commit 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/net/ipv4/tcp.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -205,11 +206,6 @@
205206 * Hirokazu Takahashi : Use copy_from_user() instead of
206207 * csum_and_copy_from_user() if possible.
207208 *
208
- * This program is free software; you can redistribute it and/or
209
- * modify it under the terms of the GNU General Public License
210
- * as published by the Free Software Foundation; either version
211
- * 2 of the License, or(at your option) any later version.
212
- *
213209 * Description of States:
214210 *
215211 * TCP_SYN_SENT sent a connection request, waiting for ack
....@@ -262,7 +258,7 @@
262258 #include <linux/net.h>
263259 #include <linux/socket.h>
264260 #include <linux/random.h>
265
-#include <linux/bootmem.h>
261
+#include <linux/memblock.h>
266262 #include <linux/highmem.h>
267263 #include <linux/swap.h>
268264 #include <linux/cache.h>
....@@ -275,6 +271,7 @@
275271 #include <net/icmp.h>
276272 #include <net/inet_common.h>
277273 #include <net/tcp.h>
274
+#include <net/mptcp.h>
278275 #include <net/xfrm.h>
279276 #include <net/ip.h>
280277 #include <net/sock.h>
....@@ -282,6 +279,8 @@
282279 #include <linux/uaccess.h>
283280 #include <asm/ioctls.h>
284281 #include <net/busy_poll.h>
282
+
283
+#include <trace/hooks/ipv4.h>
285284
286285 struct percpu_counter tcp_orphan_count;
287286 EXPORT_SYMBOL_GPL(tcp_orphan_count);
....@@ -320,6 +319,11 @@
320319 */
321320 unsigned long tcp_memory_pressure __read_mostly;
322321 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
322
+
323
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
324
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
325
+
326
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
323327
324328 void tcp_enter_memory_pressure(struct sock *sk)
325329 {
....@@ -416,6 +420,8 @@
416420 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
417421
418422 icsk->icsk_rto = TCP_TIMEOUT_INIT;
423
+ icsk->icsk_rto_min = TCP_RTO_MIN;
424
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
419425 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
420426 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
421427
....@@ -436,38 +442,24 @@
436442 tp->snd_cwnd_clamp = ~0;
437443 tp->mss_cache = TCP_MSS_DEFAULT;
438444
439
- tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
445
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
440446 tcp_assign_congestion_control(sk);
441447
442448 tp->tsoffset = 0;
443449 tp->rack.reo_wnd_steps = 1;
444
-
445
- sk->sk_state = TCP_CLOSE;
446450
447451 sk->sk_write_space = sk_stream_write_space;
448452 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
449453
450454 icsk->icsk_sync_mss = tcp_sync_mss;
451455
452
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
453
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
456
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
457
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
454458
455459 sk_sockets_allocated_inc(sk);
456460 sk->sk_route_forced_caps = NETIF_F_GSO;
457461 }
458462 EXPORT_SYMBOL(tcp_init_sock);
459
-
460
-void tcp_init_transfer(struct sock *sk, int bpf_op)
461
-{
462
- struct inet_connection_sock *icsk = inet_csk(sk);
463
-
464
- tcp_mtup_init(sk);
465
- icsk->icsk_af_ops->rebuild_header(sk);
466
- tcp_init_metrics(sk);
467
- tcp_call_bpf(sk, bpf_op, 0, NULL);
468
- tcp_init_congestion_control(sk);
469
- tcp_init_buffer_space(sk);
470
-}
471463
472464 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
473465 {
....@@ -564,10 +556,10 @@
564556
565557 /* Connected or passive Fast Open socket? */
566558 if (state != TCP_SYN_SENT &&
567
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
559
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
568560 int target = sock_rcvlowat(sk, 0, INT_MAX);
569561
570
- if (tp->urg_seq == READ_ONCE(tp->copied_seq) &&
562
+ if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
571563 !sock_flag(sk, SOCK_URGINLINE) &&
572564 tp->urg_data)
573565 target++;
....@@ -576,7 +568,7 @@
576568 mask |= EPOLLIN | EPOLLRDNORM;
577569
578570 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
579
- if (sk_stream_is_writeable(sk)) {
571
+ if (__sk_stream_is_writeable(sk, 1)) {
580572 mask |= EPOLLOUT | EPOLLWRNORM;
581573 } else { /* send SIGIO later */
582574 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
....@@ -588,7 +580,7 @@
588580 * pairs with the input side.
589581 */
590582 smp_mb__after_atomic();
591
- if (sk_stream_is_writeable(sk))
583
+ if (__sk_stream_is_writeable(sk, 1))
592584 mask |= EPOLLOUT | EPOLLWRNORM;
593585 }
594586 } else
....@@ -628,7 +620,8 @@
628620 unlock_sock_fast(sk, slow);
629621 break;
630622 case SIOCATMARK:
631
- answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq);
623
+ answ = tp->urg_data &&
624
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
632625 break;
633626 case SIOCOUTQ:
634627 if (sk->sk_state == TCP_LISTEN)
....@@ -646,7 +639,8 @@
646639 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
647640 answ = 0;
648641 else
649
- answ = READ_ONCE(tp->write_seq) - tp->snd_nxt;
642
+ answ = READ_ONCE(tp->write_seq) -
643
+ READ_ONCE(tp->snd_nxt);
650644 break;
651645 default:
652646 return -ENOIOCTLCMD;
....@@ -678,7 +672,7 @@
678672 tcb->sacked = 0;
679673 __skb_header_release(skb);
680674 tcp_add_write_queue_tail(sk, skb);
681
- sk->sk_wmem_queued += skb->truesize;
675
+ sk_wmem_queued_add(sk, skb->truesize);
682676 sk_mem_charge(sk, skb->truesize);
683677 if (tp->nonagle & TCP_NAGLE_PUSH)
684678 tp->nonagle &= ~TCP_NAGLE_PUSH;
....@@ -706,13 +700,13 @@
706700 int size_goal)
707701 {
708702 return skb->len < size_goal &&
709
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
703
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
710704 !tcp_rtx_queue_empty(sk) &&
711705 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
712706 }
713707
714
-static void tcp_push(struct sock *sk, int flags, int mss_now,
715
- int nonagle, int size_goal)
708
+void tcp_push(struct sock *sk, int flags, int mss_now,
709
+ int nonagle, int size_goal)
716710 {
717711 struct tcp_sock *tp = tcp_sk(sk);
718712 struct sk_buff *skb;
....@@ -875,6 +869,18 @@
875869 {
876870 struct sk_buff *skb;
877871
872
+ if (likely(!size)) {
873
+ skb = sk->sk_tx_skb_cache;
874
+ if (skb) {
875
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
876
+ sk->sk_tx_skb_cache = NULL;
877
+ pskb_trim(skb, 0);
878
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
879
+ skb_shinfo(skb)->tx_flags = 0;
880
+ memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
881
+ return skb;
882
+ }
883
+ }
878884 /* The TCP header must be at least 32-bit aligned. */
879885 size = ALIGN(size, 4);
880886
....@@ -934,7 +940,7 @@
934940 return max(size_goal, mss_now);
935941 }
936942
937
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
943
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
938944 {
939945 int mss_now;
940946
....@@ -969,6 +975,11 @@
969975 ssize_t copied;
970976 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
971977
978
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
979
+ WARN_ONCE(!sendpage_ok(page),
980
+ "page must not be a Slab one and have page_count > 0"))
981
+ return -EINVAL;
982
+
972983 /* Wait for a connection to finish. One exception is TCP Fast Open
973984 * (passive side) where data is allowed to be sent before a connection
974985 * is fully established.
....@@ -998,13 +1009,16 @@
9981009 !tcp_skb_can_collapse_to(skb)) {
9991010 new_segment:
10001011 if (!sk_stream_memory_free(sk))
1001
- goto wait_for_sndbuf;
1012
+ goto wait_for_space;
10021013
10031014 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
10041015 tcp_rtx_and_write_queues_empty(sk));
10051016 if (!skb)
1006
- goto wait_for_memory;
1017
+ goto wait_for_space;
10071018
1019
+#ifdef CONFIG_TLS_DEVICE
1020
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1021
+#endif
10081022 skb_entail(sk, skb);
10091023 copy = size_goal;
10101024 }
....@@ -1019,7 +1033,7 @@
10191033 goto new_segment;
10201034 }
10211035 if (!sk_wmem_schedule(sk, copy))
1022
- goto wait_for_memory;
1036
+ goto wait_for_space;
10231037
10241038 if (can_coalesce) {
10251039 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
....@@ -1034,7 +1048,7 @@
10341048 skb->len += copy;
10351049 skb->data_len += copy;
10361050 skb->truesize += copy;
1037
- sk->sk_wmem_queued += copy;
1051
+ sk_wmem_queued_add(sk, copy);
10381052 sk_mem_charge(sk, copy);
10391053 skb->ip_summed = CHECKSUM_PARTIAL;
10401054 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
....@@ -1060,9 +1074,8 @@
10601074 tcp_push_one(sk, mss_now);
10611075 continue;
10621076
1063
-wait_for_sndbuf:
1077
+wait_for_space:
10641078 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1065
-wait_for_memory:
10661079 tcp_push(sk, flags & ~MSG_MORE, mss_now,
10671080 TCP_NAGLE_PUSH, size_goal);
10681081
....@@ -1120,30 +1133,6 @@
11201133 }
11211134 EXPORT_SYMBOL(tcp_sendpage);
11221135
1123
-/* Do not bother using a page frag for very small frames.
1124
- * But use this heuristic only for the first skb in write queue.
1125
- *
1126
- * Having no payload in skb->head allows better SACK shifting
1127
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
1128
- * write queue has less skbs.
1129
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
1130
- * This also speeds up tso_fragment(), since it wont fallback
1131
- * to tcp_fragment().
1132
- */
1133
-static int linear_payload_sz(bool first_skb)
1134
-{
1135
- if (first_skb)
1136
- return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1137
- return 0;
1138
-}
1139
-
1140
-static int select_size(bool first_skb, bool zc)
1141
-{
1142
- if (zc)
1143
- return 0;
1144
- return linear_payload_sz(first_skb);
1145
-}
1146
-
11471136 void tcp_free_fastopen_req(struct tcp_sock *tp)
11481137 {
11491138 if (tp->fastopen_req) {
....@@ -1153,14 +1142,16 @@
11531142 }
11541143
11551144 static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1156
- int *copied, size_t size)
1145
+ int *copied, size_t size,
1146
+ struct ubuf_info *uarg)
11571147 {
11581148 struct tcp_sock *tp = tcp_sk(sk);
11591149 struct inet_sock *inet = inet_sk(sk);
11601150 struct sockaddr *uaddr = msg->msg_name;
11611151 int err, flags;
11621152
1163
- if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1153
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
1154
+ TFO_CLIENT_ENABLE) ||
11641155 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
11651156 uaddr->sa_family == AF_UNSPEC))
11661157 return -EOPNOTSUPP;
....@@ -1173,6 +1164,7 @@
11731164 return -ENOBUFS;
11741165 tp->fastopen_req->data = msg;
11751166 tp->fastopen_req->size = size;
1167
+ tp->fastopen_req->uarg = uarg;
11761168
11771169 if (inet->defer_connect) {
11781170 err = tcp_connect(sk);
....@@ -1205,18 +1197,14 @@
12051197 struct sockcm_cookie sockc;
12061198 int flags, err, copied = 0;
12071199 int mss_now = 0, size_goal, copied_syn = 0;
1208
- bool process_backlog = false;
1200
+ int process_backlog = 0;
12091201 bool zc = false;
12101202 long timeo;
12111203
1204
+ trace_android_rvh_tcp_sendmsg_locked(sk, size);
12121205 flags = msg->msg_flags;
12131206
12141207 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1215
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
1216
- err = -EINVAL;
1217
- goto out_err;
1218
- }
1219
-
12201208 skb = tcp_write_queue_tail(sk);
12211209 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
12221210 if (!uarg) {
....@@ -1231,7 +1219,7 @@
12311219
12321220 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
12331221 !tp->repair) {
1234
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1222
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
12351223 if (err == -EINPROGRESS && copied_syn > 0)
12361224 goto out;
12371225 else if (err)
....@@ -1297,31 +1285,30 @@
12971285
12981286 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
12991287 bool first_skb;
1300
- int linear;
13011288
13021289 new_segment:
13031290 if (!sk_stream_memory_free(sk))
1304
- goto wait_for_sndbuf;
1291
+ goto wait_for_space;
13051292
1306
- if (process_backlog && sk_flush_backlog(sk)) {
1307
- process_backlog = false;
1308
- goto restart;
1293
+ if (unlikely(process_backlog >= 16)) {
1294
+ process_backlog = 0;
1295
+ if (sk_flush_backlog(sk))
1296
+ goto restart;
13091297 }
13101298 first_skb = tcp_rtx_and_write_queues_empty(sk);
1311
- linear = select_size(first_skb, zc);
1312
- skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
1299
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
13131300 first_skb);
13141301 if (!skb)
1315
- goto wait_for_memory;
1302
+ goto wait_for_space;
13161303
1317
- process_backlog = true;
1304
+ process_backlog++;
13181305 skb->ip_summed = CHECKSUM_PARTIAL;
13191306
13201307 skb_entail(sk, skb);
13211308 copy = size_goal;
13221309
13231310 /* All packets are restored as if they have
1324
- * already been sent. skb_mstamp isn't set to
1311
+ * already been sent. skb_mstamp_ns isn't set to
13251312 * avoid wrong rtt estimation.
13261313 */
13271314 if (tp->repair)
....@@ -1345,7 +1332,7 @@
13451332 struct page_frag *pfrag = sk_page_frag(sk);
13461333
13471334 if (!sk_page_frag_refill(sk, pfrag))
1348
- goto wait_for_memory;
1335
+ goto wait_for_space;
13491336
13501337 if (!skb_can_coalesce(skb, i, pfrag->page,
13511338 pfrag->offset)) {
....@@ -1359,7 +1346,7 @@
13591346 copy = min_t(int, copy, pfrag->size - pfrag->offset);
13601347
13611348 if (!sk_wmem_schedule(sk, copy))
1362
- goto wait_for_memory;
1349
+ goto wait_for_space;
13631350
13641351 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
13651352 pfrag->page,
....@@ -1378,6 +1365,9 @@
13781365 }
13791366 pfrag->offset += copy;
13801367 } else {
1368
+ if (!sk_wmem_schedule(sk, copy))
1369
+ goto wait_for_space;
1370
+
13811371 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
13821372 if (err == -EMSGSIZE || err == -EEXIST) {
13831373 tcp_mark_push(tp, skb);
....@@ -1412,9 +1402,8 @@
14121402 tcp_push_one(sk, mss_now);
14131403 continue;
14141404
1415
-wait_for_sndbuf:
1405
+wait_for_space:
14161406 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1417
-wait_for_memory:
14181407 if (copied)
14191408 tcp_push(sk, flags & ~MSG_MORE, mss_now,
14201409 TCP_NAGLE_PUSH, size_goal);
....@@ -1443,7 +1432,7 @@
14431432 if (copied + copied_syn)
14441433 goto out;
14451434 out_err:
1446
- sock_zerocopy_put_abort(uarg);
1435
+ sock_zerocopy_put_abort(uarg, true);
14471436 err = sk_stream_error(sk, flags, err);
14481437 /* make sure we wake any epoll edge trigger waiter */
14491438 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
....@@ -1546,7 +1535,7 @@
15461535 * calculation of whether or not we must ACK for the sake of
15471536 * a window update.
15481537 */
1549
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1538
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
15501539 {
15511540 struct tcp_sock *tp = tcp_sk(sk);
15521541 bool time_to_ack = false;
....@@ -1559,10 +1548,8 @@
15591548
15601549 if (inet_csk_ack_scheduled(sk)) {
15611550 const struct inet_connection_sock *icsk = inet_csk(sk);
1562
- /* Delayed ACKs frequently hit locked sockets during bulk
1563
- * receive. */
1564
- if (icsk->icsk_ack.blocked ||
1565
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
1551
+
1552
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
15661553 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
15671554 /*
15681555 * If this read emptied read buffer, we send ACK, if
....@@ -1573,7 +1560,7 @@
15731560 (copied > 0 &&
15741561 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
15751562 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1576
- !icsk->icsk_ack.pingpong)) &&
1563
+ !inet_csk_in_pingpong_mode(sk))) &&
15771564 !atomic_read(&sk->sk_rmem_alloc)))
15781565 time_to_ack = true;
15791566 }
....@@ -1669,11 +1656,13 @@
16691656 if (!copied)
16701657 copied = used;
16711658 break;
1672
- } else if (used <= len) {
1673
- seq += used;
1674
- copied += used;
1675
- offset += used;
16761659 }
1660
+ if (WARN_ON_ONCE(used > len))
1661
+ used = len;
1662
+ seq += used;
1663
+ copied += used;
1664
+ offset += used;
1665
+
16771666 /* If recv_actor drops the lock (e.g. TCP splice
16781667 * receive) the skb pointer might be invalid when
16791668 * getting here: tcp_collapse might have deleted it
....@@ -1725,9 +1714,9 @@
17251714 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
17261715 cap = sk->sk_rcvbuf >> 1;
17271716 else
1728
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1717
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
17291718 val = min(val, cap);
1730
- sk->sk_rcvlowat = val ? : 1;
1719
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
17311720
17321721 /* Check if we need to signal EPOLLIN right now */
17331722 tcp_data_ready(sk);
....@@ -1737,7 +1726,7 @@
17371726
17381727 val <<= 1;
17391728 if (val > sk->sk_rcvbuf) {
1740
- sk->sk_rcvbuf = val;
1729
+ WRITE_ONCE(sk->sk_rcvbuf, val);
17411730 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
17421731 }
17431732 return 0;
....@@ -1755,7 +1744,7 @@
17551744 return -EPERM;
17561745 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
17571746
1758
- /* Instruct vm_insert_page() to not down_read(mmap_sem) */
1747
+ /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
17591748 vma->vm_flags |= VM_MIXEDMAP;
17601749
17611750 vma->vm_ops = &tcp_vm_ops;
....@@ -1763,16 +1752,126 @@
17631752 }
17641753 EXPORT_SYMBOL(tcp_mmap);
17651754
1755
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1756
+ u32 *offset_frag)
1757
+{
1758
+ skb_frag_t *frag;
1759
+
1760
+ if (unlikely(offset_skb >= skb->len))
1761
+ return NULL;
1762
+
1763
+ offset_skb -= skb_headlen(skb);
1764
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1765
+ return NULL;
1766
+
1767
+ frag = skb_shinfo(skb)->frags;
1768
+ while (offset_skb) {
1769
+ if (skb_frag_size(frag) > offset_skb) {
1770
+ *offset_frag = offset_skb;
1771
+ return frag;
1772
+ }
1773
+ offset_skb -= skb_frag_size(frag);
1774
+ ++frag;
1775
+ }
1776
+ *offset_frag = 0;
1777
+ return frag;
1778
+}
1779
+
1780
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1781
+ struct sk_buff *skb, u32 copylen,
1782
+ u32 *offset, u32 *seq)
1783
+{
1784
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
1785
+ struct msghdr msg = {};
1786
+ struct iovec iov;
1787
+ int err;
1788
+
1789
+ if (copy_address != zc->copybuf_address)
1790
+ return -EINVAL;
1791
+
1792
+ err = import_single_range(READ, (void __user *)copy_address,
1793
+ copylen, &iov, &msg.msg_iter);
1794
+ if (err)
1795
+ return err;
1796
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1797
+ if (err)
1798
+ return err;
1799
+ zc->recv_skip_hint -= copylen;
1800
+ *offset += copylen;
1801
+ *seq += copylen;
1802
+ return (__s32)copylen;
1803
+}
1804
+
1805
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
1806
+ struct sock *sk,
1807
+ struct sk_buff *skb,
1808
+ u32 *seq,
1809
+ s32 copybuf_len)
1810
+{
1811
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1812
+
1813
+ if (!copylen)
1814
+ return 0;
1815
+ /* skb is null if inq < PAGE_SIZE. */
1816
+ if (skb)
1817
+ offset = *seq - TCP_SKB_CB(skb)->seq;
1818
+ else
1819
+ skb = tcp_recv_skb(sk, *seq, &offset);
1820
+
1821
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1822
+ seq);
1823
+ return zc->copybuf_len < 0 ? 0 : copylen;
1824
+}
1825
+
1826
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1827
+ struct page **pages,
1828
+ unsigned long pages_to_map,
1829
+ unsigned long *insert_addr,
1830
+ u32 *length_with_pending,
1831
+ u32 *seq,
1832
+ struct tcp_zerocopy_receive *zc)
1833
+{
1834
+ unsigned long pages_remaining = pages_to_map;
1835
+ int bytes_mapped;
1836
+ int ret;
1837
+
1838
+ ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
1839
+ bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
1840
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
1841
+ * mapping (some but not all of the pages).
1842
+ */
1843
+ *seq += bytes_mapped;
1844
+ *insert_addr += bytes_mapped;
1845
+ if (ret) {
1846
+ /* But if vm_insert_pages did fail, we have to unroll some state
1847
+ * we speculatively touched before.
1848
+ */
1849
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1850
+ *length_with_pending -= bytes_not_mapped;
1851
+ zc->recv_skip_hint += bytes_not_mapped;
1852
+ }
1853
+ return ret;
1854
+}
1855
+
17661856 static int tcp_zerocopy_receive(struct sock *sk,
17671857 struct tcp_zerocopy_receive *zc)
17681858 {
1859
+ u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
17691860 unsigned long address = (unsigned long)zc->address;
1861
+ s32 copybuf_len = zc->copybuf_len;
1862
+ struct tcp_sock *tp = tcp_sk(sk);
1863
+ #define PAGE_BATCH_SIZE 8
1864
+ struct page *pages[PAGE_BATCH_SIZE];
17701865 const skb_frag_t *frags = NULL;
1771
- u32 length = 0, seq, offset;
17721866 struct vm_area_struct *vma;
17731867 struct sk_buff *skb = NULL;
1774
- struct tcp_sock *tp;
1868
+ unsigned long pg_idx = 0;
1869
+ unsigned long curr_addr;
1870
+ u32 seq = tp->copied_seq;
1871
+ int inq = tcp_inq(sk);
17751872 int ret;
1873
+
1874
+ zc->copybuf_len = 0;
17761875
17771876 if (address & (PAGE_SIZE - 1) || address != zc->address)
17781877 return -EINVAL;
....@@ -1782,65 +1881,98 @@
17821881
17831882 sock_rps_record_flow(sk);
17841883
1785
- down_read(&current->mm->mmap_sem);
1884
+ mmap_read_lock(current->mm);
17861885
17871886 vma = find_vma(current->mm, address);
17881887 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1789
- up_read(&current->mm->mmap_sem);
1888
+ mmap_read_unlock(current->mm);
17901889 return -EINVAL;
17911890 }
1792
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1793
-
1794
- tp = tcp_sk(sk);
1795
- seq = tp->copied_seq;
1796
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
1797
- zc->length &= ~(PAGE_SIZE - 1);
1798
-
1799
- zap_page_range(vma, address, zc->length);
1800
-
1801
- zc->recv_skip_hint = 0;
1891
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
1892
+ avail_len = min_t(u32, vma_len, inq);
1893
+ aligned_len = avail_len & ~(PAGE_SIZE - 1);
1894
+ if (aligned_len) {
1895
+ zap_page_range(vma, address, aligned_len);
1896
+ zc->length = aligned_len;
1897
+ zc->recv_skip_hint = 0;
1898
+ } else {
1899
+ zc->length = avail_len;
1900
+ zc->recv_skip_hint = avail_len;
1901
+ }
18021902 ret = 0;
1903
+ curr_addr = address;
18031904 while (length + PAGE_SIZE <= zc->length) {
18041905 if (zc->recv_skip_hint < PAGE_SIZE) {
1906
+ u32 offset_frag;
1907
+
1908
+ /* If we're here, finish the current batch. */
1909
+ if (pg_idx) {
1910
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
1911
+ pg_idx,
1912
+ &curr_addr,
1913
+ &length,
1914
+ &seq, zc);
1915
+ if (ret)
1916
+ goto out;
1917
+ pg_idx = 0;
1918
+ }
18051919 if (skb) {
1920
+ if (zc->recv_skip_hint > 0)
1921
+ break;
18061922 skb = skb->next;
18071923 offset = seq - TCP_SKB_CB(skb)->seq;
18081924 } else {
18091925 skb = tcp_recv_skb(sk, seq, &offset);
18101926 }
1811
-
18121927 zc->recv_skip_hint = skb->len - offset;
1813
- offset -= skb_headlen(skb);
1814
- if ((int)offset < 0 || skb_has_frag_list(skb))
1928
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
1929
+ if (!frags || offset_frag)
18151930 break;
1816
- frags = skb_shinfo(skb)->frags;
1817
- while (offset) {
1818
- if (frags->size > offset)
1819
- goto out;
1820
- offset -= frags->size;
1931
+ }
1932
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1933
+ int remaining = zc->recv_skip_hint;
1934
+
1935
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1936
+ skb_frag_off(frags))) {
1937
+ remaining -= skb_frag_size(frags);
18211938 frags++;
18221939 }
1940
+ zc->recv_skip_hint -= remaining;
1941
+ break;
18231942 }
1824
- if (frags->size != PAGE_SIZE || frags->page_offset)
1825
- break;
1826
- ret = vm_insert_page(vma, address + length,
1827
- skb_frag_page(frags));
1828
- if (ret)
1829
- break;
1943
+ pages[pg_idx] = skb_frag_page(frags);
1944
+ pg_idx++;
18301945 length += PAGE_SIZE;
1831
- seq += PAGE_SIZE;
18321946 zc->recv_skip_hint -= PAGE_SIZE;
18331947 frags++;
1948
+ if (pg_idx == PAGE_BATCH_SIZE) {
1949
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1950
+ &curr_addr, &length,
1951
+ &seq, zc);
1952
+ if (ret)
1953
+ goto out;
1954
+ pg_idx = 0;
1955
+ }
1956
+ }
1957
+ if (pg_idx) {
1958
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1959
+ &curr_addr, &length, &seq,
1960
+ zc);
18341961 }
18351962 out:
1836
- up_read(&current->mm->mmap_sem);
1837
- if (length) {
1963
+ mmap_read_unlock(current->mm);
1964
+ /* Try to copy straggler data. */
1965
+ if (!ret)
1966
+ copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
1967
+ copybuf_len);
1968
+
1969
+ if (length + copylen) {
18381970 WRITE_ONCE(tp->copied_seq, seq);
18391971 tcp_rcv_space_adjust(sk);
18401972
18411973 /* Clean up data we have read: This will do ACK frames. */
18421974 tcp_recv_skb(sk, seq, &offset);
1843
- tcp_cleanup_rbuf(sk, length);
1975
+ tcp_cleanup_rbuf(sk, length + copylen);
18441976 ret = 0;
18451977 if (length == zc->length)
18461978 zc->recv_skip_hint = 0;
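
[Sketch, not part of the patch] The reworked tcp_zerocopy_receive() above batches page remapping through vm_insert_pages() and copies any sub-page leftovers into a user-supplied bounce buffer (copybuf_address/copybuf_len). A minimal userspace sketch of driving that interface, assuming a kernel carrying this change and the uapi struct from <linux/tcp.h>; zerocopy_read() and its arguments are illustrative only:

    #include <string.h>
    #include <sys/mman.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/tcp.h>	/* struct tcp_zerocopy_receive, TCP_ZEROCOPY_RECEIVE */

    /* map is a PROT_READ, MAP_SHARED mmap() of the TCP socket fd. */
    static ssize_t zerocopy_read(int fd, void *map, size_t map_len,
    			     char *copybuf, int copybuf_len)
    {
    	struct tcp_zerocopy_receive zc;
    	socklen_t zc_len = sizeof(zc);

    	memset(&zc, 0, sizeof(zc));
    	zc.address = (__u64)(unsigned long)map;		/* where whole pages get mapped */
    	zc.length = map_len;				/* max bytes to map in place */
    	zc.copybuf_address = (__u64)(unsigned long)copybuf;
    	zc.copybuf_len = copybuf_len;			/* straggler bytes are copied here */

    	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
    		return -1;

    	/* On return, zc.length bytes are readable at map, plus zc.copybuf_len
    	 * bytes (when positive) in copybuf; zc.recv_skip_hint is the amount
    	 * left to drain with a regular recv() before the next call. */
    	return (ssize_t)zc.length + (zc.copybuf_len > 0 ? zc.copybuf_len : 0);
    }
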
....@@ -1854,57 +1986,82 @@
18541986 #endif
18551987
18561988 static void tcp_update_recv_tstamps(struct sk_buff *skb,
1857
- struct scm_timestamping *tss)
1989
+ struct scm_timestamping_internal *tss)
18581990 {
18591991 if (skb->tstamp)
1860
- tss->ts[0] = ktime_to_timespec(skb->tstamp);
1992
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
18611993 else
1862
- tss->ts[0] = (struct timespec) {0};
1994
+ tss->ts[0] = (struct timespec64) {0};
18631995
18641996 if (skb_hwtstamps(skb)->hwtstamp)
1865
- tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
1997
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
18661998 else
1867
- tss->ts[2] = (struct timespec) {0};
1999
+ tss->ts[2] = (struct timespec64) {0};
18682000 }
18692001
18702002 /* Similar to __sock_recv_timestamp, but does not require an skb */
18712003 static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1872
- struct scm_timestamping *tss)
2004
+ struct scm_timestamping_internal *tss)
18732005 {
1874
- struct timeval tv;
2006
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
18752007 bool has_timestamping = false;
18762008
18772009 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
18782010 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
18792011 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
1880
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
1881
- sizeof(tss->ts[0]), &tss->ts[0]);
2012
+ if (new_tstamp) {
2013
+ struct __kernel_timespec kts = {
2014
+ .tv_sec = tss->ts[0].tv_sec,
2015
+ .tv_nsec = tss->ts[0].tv_nsec,
2016
+ };
2017
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2018
+ sizeof(kts), &kts);
2019
+ } else {
2020
+ struct __kernel_old_timespec ts_old = {
2021
+ .tv_sec = tss->ts[0].tv_sec,
2022
+ .tv_nsec = tss->ts[0].tv_nsec,
2023
+ };
2024
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2025
+ sizeof(ts_old), &ts_old);
2026
+ }
18822027 } else {
1883
- tv.tv_sec = tss->ts[0].tv_sec;
1884
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
1885
-
1886
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
1887
- sizeof(tv), &tv);
2028
+ if (new_tstamp) {
2029
+ struct __kernel_sock_timeval stv = {
2030
+ .tv_sec = tss->ts[0].tv_sec,
2031
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
2032
+ };
2033
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2034
+ sizeof(stv), &stv);
2035
+ } else {
2036
+ struct __kernel_old_timeval tv = {
2037
+ .tv_sec = tss->ts[0].tv_sec,
2038
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
2039
+ };
2040
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2041
+ sizeof(tv), &tv);
2042
+ }
18882043 }
18892044 }
18902045
18912046 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
18922047 has_timestamping = true;
18932048 else
1894
- tss->ts[0] = (struct timespec) {0};
2049
+ tss->ts[0] = (struct timespec64) {0};
18952050 }
18962051
18972052 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
18982053 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
18992054 has_timestamping = true;
19002055 else
1901
- tss->ts[2] = (struct timespec) {0};
2056
+ tss->ts[2] = (struct timespec64) {0};
19022057 }
19032058
19042059 if (has_timestamping) {
1905
- tss->ts[1] = (struct timespec) {0};
1906
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
1907
- sizeof(*tss), tss);
2060
+ tss->ts[1] = (struct timespec64) {0};
2061
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
2062
+ put_cmsg_scm_timestamping64(msg, tss);
2063
+ else
2064
+ put_cmsg_scm_timestamping(msg, tss);
19082065 }
19092066 }
19102067
....@@ -1950,12 +2107,12 @@
19502107 long timeo;
19512108 struct sk_buff *skb, *last;
19522109 u32 urg_hole = 0;
1953
- struct scm_timestamping tss;
1954
- bool has_tss = false;
1955
- bool has_cmsg;
2110
+ struct scm_timestamping_internal tss;
2111
+ int cmsg_flags;
19562112
19572113 if (unlikely(flags & MSG_ERRQUEUE))
19582114 return inet_recv_error(sk, msg, len, addr_len);
2115
+ trace_android_rvh_tcp_recvmsg(sk);
19592116
19602117 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
19612118 (sk->sk_state == TCP_ESTABLISHED))
....@@ -1967,7 +2124,7 @@
19672124 if (sk->sk_state == TCP_LISTEN)
19682125 goto out;
19692126
1970
- has_cmsg = tp->recvmsg_inq;
2127
+ cmsg_flags = tp->recvmsg_inq ? 1 : 0;
19712128 timeo = sock_rcvtimeo(sk, nonblock);
19722129
19732130 /* Urgent data needs to be handled specially. */
....@@ -2100,7 +2257,7 @@
21002257 }
21012258 continue;
21022259
2103
- found_ok_skb:
2260
+found_ok_skb:
21042261 /* Ok so how much can we use? */
21052262 used = skb->len - offset;
21062263 if (len < used)
....@@ -2148,8 +2305,7 @@
21482305
21492306 if (TCP_SKB_CB(skb)->has_rxtstamp) {
21502307 tcp_update_recv_tstamps(skb, &tss);
2151
- has_tss = true;
2152
- has_cmsg = true;
2308
+ cmsg_flags |= 2;
21532309 }
21542310
21552311 if (used + offset < skb->len)
....@@ -2161,7 +2317,7 @@
21612317 sk_eat_skb(sk, skb);
21622318 continue;
21632319
2164
- found_fin_ok:
2320
+found_fin_ok:
21652321 /* Process the FIN. */
21662322 WRITE_ONCE(*seq, *seq + 1);
21672323 if (!(flags & MSG_PEEK))
....@@ -2169,6 +2325,7 @@
21692325 break;
21702326 } while (len > 0);
21712327
2328
+ trace_android_rvh_tcp_recvmsg_stat(sk, copied);
21722329 /* According to UNIX98, msg_name/msg_namelen are ignored
21732330 * on connected socket. I was just happy when found this 8) --ANK
21742331 */
....@@ -2178,10 +2335,10 @@
21782335
21792336 release_sock(sk);
21802337
2181
- if (has_cmsg) {
2182
- if (has_tss)
2338
+ if (cmsg_flags) {
2339
+ if (cmsg_flags & 2)
21832340 tcp_recv_timestamp(msg, sk, &tss);
2184
- if (tp->recvmsg_inq) {
2341
+ if (cmsg_flags & 1) {
21852342 inq = tcp_inq_hint(sk);
21862343 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
21872344 }
....@@ -2245,7 +2402,7 @@
22452402 if (inet_csk(sk)->icsk_bind_hash &&
22462403 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
22472404 inet_put_port(sk);
2248
- /* fall through */
2405
+ fallthrough;
22492406 default:
22502407 if (oldstate == TCP_ESTABLISHED)
22512408 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
....@@ -2255,10 +2412,6 @@
22552412 * socket sitting in hash tables.
22562413 */
22572414 inet_sk_state_store(sk, state);
2258
-
2259
-#ifdef STATE_TRACE
2260
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
2261
-#endif
22622415 }
22632416 EXPORT_SYMBOL_GPL(tcp_set_state);
22642417
....@@ -2488,7 +2641,10 @@
24882641 }
24892642
24902643 if (sk->sk_state == TCP_CLOSE) {
2491
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2644
+ struct request_sock *req;
2645
+
2646
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2647
+ lockdep_sock_is_held(sk));
24922648 /* We could get here with a non-NULL req if the socket is
24932649 * aborted (e.g., closed with unread data) before 3WHS
24942650 * finishes.
....@@ -2543,6 +2699,11 @@
25432699 sk_wmem_free_skb(sk, skb);
25442700 }
25452701 tcp_rtx_queue_purge(sk);
2702
+ skb = sk->sk_tx_skb_cache;
2703
+ if (skb) {
2704
+ __kfree_skb(skb);
2705
+ sk->sk_tx_skb_cache = NULL;
2706
+ }
25462707 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
25472708 sk_mem_reclaim(sk);
25482709 tcp_clear_all_retrans_hints(tcp_sk(sk));
....@@ -2579,6 +2740,10 @@
25792740
25802741 tcp_clear_xmit_timers(sk);
25812742 __skb_queue_purge(&sk->sk_receive_queue);
2743
+ if (sk->sk_rx_skb_cache) {
2744
+ __kfree_skb(sk->sk_rx_skb_cache);
2745
+ sk->sk_rx_skb_cache = NULL;
2746
+ }
25822747 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
25832748 tp->urg_data = 0;
25842749 tcp_write_queue_purge(sk);
....@@ -2593,6 +2758,7 @@
25932758 sk->sk_shutdown = 0;
25942759 sock_reset_flag(sk, SOCK_DONE);
25952760 tp->srtt_us = 0;
2761
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
25962762 tp->rcv_rtt_last_tsecr = 0;
25972763
25982764 seq = tp->write_seq + tp->max_window + 2;
....@@ -2600,16 +2766,24 @@
26002766 seq = 1;
26012767 WRITE_ONCE(tp->write_seq, seq);
26022768
2603
- tp->snd_cwnd = 2;
2769
+ icsk->icsk_backoff = 0;
26042770 icsk->icsk_probes_out = 0;
2771
+ icsk->icsk_probes_tstamp = 0;
2772
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
2773
+ icsk->icsk_rto_min = TCP_RTO_MIN;
2774
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
26052775 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2776
+ tp->snd_cwnd = TCP_INIT_CWND;
26062777 tp->snd_cwnd_cnt = 0;
2778
+ tp->is_cwnd_limited = 0;
2779
+ tp->max_packets_out = 0;
26072780 tp->window_clamp = 0;
26082781 tp->delivered = 0;
26092782 tp->delivered_ce = 0;
26102783 if (icsk->icsk_ca_ops->release)
26112784 icsk->icsk_ca_ops->release(sk);
26122785 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
2786
+ icsk->icsk_ca_initialized = 0;
26132787 tcp_set_ca_state(sk, TCP_CA_Open);
26142788 tp->is_sack_reneg = 0;
26152789 tcp_clear_retrans(tp);
....@@ -2621,8 +2795,7 @@
26212795 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
26222796 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
26232797 __sk_dst_reset(sk);
2624
- dst_release(sk->sk_rx_dst);
2625
- sk->sk_rx_dst = NULL;
2798
+ dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
26262799 tcp_saved_syn_free(tp);
26272800 tp->compressed_ack = 0;
26282801 tp->segs_in = 0;
....@@ -2633,12 +2806,33 @@
26332806 tp->bytes_retrans = 0;
26342807 tp->data_segs_in = 0;
26352808 tp->data_segs_out = 0;
2809
+ tp->duplicate_sack[0].start_seq = 0;
2810
+ tp->duplicate_sack[0].end_seq = 0;
26362811 tp->dsack_dups = 0;
26372812 tp->reord_seen = 0;
2813
+ tp->retrans_out = 0;
2814
+ tp->sacked_out = 0;
2815
+ tp->tlp_high_seq = 0;
2816
+ tp->last_oow_ack_time = 0;
2817
+ /* There's a bubble in the pipe until at least the first ACK. */
2818
+ tp->app_limited = ~0U;
2819
+ tp->rack.mstamp = 0;
2820
+ tp->rack.advanced = 0;
2821
+ tp->rack.reo_wnd_steps = 1;
2822
+ tp->rack.last_delivered = 0;
2823
+ tp->rack.reo_wnd_persist = 0;
2824
+ tp->rack.dsack_seen = 0;
2825
+ tp->syn_data_acked = 0;
2826
+ tp->rx_opt.saw_tstamp = 0;
2827
+ tp->rx_opt.dsack = 0;
2828
+ tp->rx_opt.num_sacks = 0;
2829
+ tp->rcv_ooopack = 0;
2830
+
26382831
26392832 /* Clean up fastopen related fields */
26402833 tcp_free_fastopen_req(tp);
26412834 inet->defer_connect = 0;
2835
+ tp->fastopen_client_fail = 0;
26422836
26432837 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
26442838
....@@ -2659,7 +2853,7 @@
26592853 (sk->sk_state != TCP_LISTEN);
26602854 }
26612855
2662
-static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2856
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
26632857 {
26642858 struct tcp_repair_window opt;
26652859
....@@ -2669,7 +2863,7 @@
26692863 if (len != sizeof(opt))
26702864 return -EINVAL;
26712865
2672
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
2866
+ if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
26732867 return -EFAULT;
26742868
26752869 if (opt.max_window < opt.snd_wnd)
....@@ -2691,17 +2885,18 @@
26912885 return 0;
26922886 }
26932887
2694
-static int tcp_repair_options_est(struct sock *sk,
2695
- struct tcp_repair_opt __user *optbuf, unsigned int len)
2888
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
2889
+ unsigned int len)
26962890 {
26972891 struct tcp_sock *tp = tcp_sk(sk);
26982892 struct tcp_repair_opt opt;
2893
+ size_t offset = 0;
26992894
27002895 while (len >= sizeof(opt)) {
2701
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
2896
+ if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
27022897 return -EFAULT;
27032898
2704
- optbuf++;
2899
+ offset += sizeof(opt);
27052900 len -= sizeof(opt);
27062901
27072902 switch (opt.opt_code) {
....@@ -2740,11 +2935,183 @@
27402935 return 0;
27412936 }
27422937
2938
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2939
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
2940
+
2941
+static void tcp_enable_tx_delay(void)
2942
+{
2943
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2944
+ static int __tcp_tx_delay_enabled = 0;
2945
+
2946
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2947
+ static_branch_enable(&tcp_tx_delay_enabled);
2948
+ pr_info("TCP_TX_DELAY enabled\n");
2949
+ }
2950
+ }
2951
+}
2952
+
2953
+/* When set indicates to always queue non-full frames. Later the user clears
2954
+ * this option and we transmit any pending partial frames in the queue. This is
2955
+ * meant to be used alongside sendfile() to get properly filled frames when the
2956
+ * user (for example) must write out headers with a write() call first and then
2957
+ * use sendfile to send out the data parts.
2958
+ *
2959
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
2960
+ * TCP_NODELAY.
2961
+ */
2962
+static void __tcp_sock_set_cork(struct sock *sk, bool on)
2963
+{
2964
+ struct tcp_sock *tp = tcp_sk(sk);
2965
+
2966
+ if (on) {
2967
+ tp->nonagle |= TCP_NAGLE_CORK;
2968
+ } else {
2969
+ tp->nonagle &= ~TCP_NAGLE_CORK;
2970
+ if (tp->nonagle & TCP_NAGLE_OFF)
2971
+ tp->nonagle |= TCP_NAGLE_PUSH;
2972
+ tcp_push_pending_frames(sk);
2973
+ }
2974
+}
2975
+
2976
+void tcp_sock_set_cork(struct sock *sk, bool on)
2977
+{
2978
+ lock_sock(sk);
2979
+ __tcp_sock_set_cork(sk, on);
2980
+ release_sock(sk);
2981
+}
2982
+EXPORT_SYMBOL(tcp_sock_set_cork);
2983
+
2984
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
2985
+ * remembered, but it is not activated until cork is cleared.
2986
+ *
2987
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
2988
+ * even TCP_CORK for currently queued segments.
2989
+ */
2990
+static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
2991
+{
2992
+ if (on) {
2993
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2994
+ tcp_push_pending_frames(sk);
2995
+ } else {
2996
+ tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
2997
+ }
2998
+}
2999
+
3000
+void tcp_sock_set_nodelay(struct sock *sk)
3001
+{
3002
+ lock_sock(sk);
3003
+ __tcp_sock_set_nodelay(sk, true);
3004
+ release_sock(sk);
3005
+}
3006
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
3007
+
3008
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
3009
+{
3010
+ if (!val) {
3011
+ inet_csk_enter_pingpong_mode(sk);
3012
+ return;
3013
+ }
3014
+
3015
+ inet_csk_exit_pingpong_mode(sk);
3016
+ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3017
+ inet_csk_ack_scheduled(sk)) {
3018
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3019
+ tcp_cleanup_rbuf(sk, 1);
3020
+ if (!(val & 1))
3021
+ inet_csk_enter_pingpong_mode(sk);
3022
+ }
3023
+}
3024
+
3025
+void tcp_sock_set_quickack(struct sock *sk, int val)
3026
+{
3027
+ lock_sock(sk);
3028
+ __tcp_sock_set_quickack(sk, val);
3029
+ release_sock(sk);
3030
+}
3031
+EXPORT_SYMBOL(tcp_sock_set_quickack);
3032
+
3033
+int tcp_sock_set_syncnt(struct sock *sk, int val)
3034
+{
3035
+ if (val < 1 || val > MAX_TCP_SYNCNT)
3036
+ return -EINVAL;
3037
+
3038
+ lock_sock(sk);
3039
+ inet_csk(sk)->icsk_syn_retries = val;
3040
+ release_sock(sk);
3041
+ return 0;
3042
+}
3043
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
3044
+
3045
+void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3046
+{
3047
+ lock_sock(sk);
3048
+ inet_csk(sk)->icsk_user_timeout = val;
3049
+ release_sock(sk);
3050
+}
3051
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3052
+
3053
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3054
+{
3055
+ struct tcp_sock *tp = tcp_sk(sk);
3056
+
3057
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
3058
+ return -EINVAL;
3059
+
3060
+ tp->keepalive_time = val * HZ;
3061
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
3062
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3063
+ u32 elapsed = keepalive_time_elapsed(tp);
3064
+
3065
+ if (tp->keepalive_time > elapsed)
3066
+ elapsed = tp->keepalive_time - elapsed;
3067
+ else
3068
+ elapsed = 0;
3069
+ inet_csk_reset_keepalive_timer(sk, elapsed);
3070
+ }
3071
+
3072
+ return 0;
3073
+}
3074
+
3075
+int tcp_sock_set_keepidle(struct sock *sk, int val)
3076
+{
3077
+ int err;
3078
+
3079
+ lock_sock(sk);
3080
+ err = tcp_sock_set_keepidle_locked(sk, val);
3081
+ release_sock(sk);
3082
+ return err;
3083
+}
3084
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
3085
+
3086
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
3087
+{
3088
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
3089
+ return -EINVAL;
3090
+
3091
+ lock_sock(sk);
3092
+ tcp_sk(sk)->keepalive_intvl = val * HZ;
3093
+ release_sock(sk);
3094
+ return 0;
3095
+}
3096
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3097
+
3098
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
3099
+{
3100
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
3101
+ return -EINVAL;
3102
+
3103
+ lock_sock(sk);
3104
+ tcp_sk(sk)->keepalive_probes = val;
3105
+ release_sock(sk);
3106
+ return 0;
3107
+}
3108
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3109
+
27433110 /*
27443111 * Socket option code for TCP.
27453112 */
2746
-static int do_tcp_setsockopt(struct sock *sk, int level,
2747
- int optname, char __user *optval, unsigned int optlen)
3113
+static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3114
+ sockptr_t optval, unsigned int optlen)
27483115 {
27493116 struct tcp_sock *tp = tcp_sk(sk);
27503117 struct inet_connection_sock *icsk = inet_csk(sk);
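
[Sketch, not part of the patch] The block above adds typed in-kernel helpers (tcp_sock_set_cork(), tcp_sock_set_nodelay(), tcp_sock_set_quickack(), the keepalive setters, ...), so kernel-internal socket users no longer need kernel_setsockopt() with userspace-style buffers. A sketch of how such a caller might use them; the function and its values are hypothetical:

    #include <linux/net.h>	/* struct socket */
    #include <linux/tcp.h>	/* tcp_sock_set_*() declarations */

    /* Hypothetical: tune a freshly connected kernel-side TCP socket
     * (assumes SO_KEEPALIVE has already been enabled on it). */
    static void example_tune_tcp_socket(struct socket *sock)
    {
    	struct sock *sk = sock->sk;

    	tcp_sock_set_nodelay(sk);			/* favour latency over batching */
    	tcp_sock_set_user_timeout(sk, 30 * 1000);	/* 30 s, value is in milliseconds */
    	tcp_sock_set_keepidle(sk, 60);			/* seconds before first keepalive probe */
    	tcp_sock_set_keepintvl(sk, 10);			/* seconds between probes */
    	tcp_sock_set_keepcnt(sk, 5);			/* probes before dropping the connection */

    	/* Build a multi-part reply: cork, queue the pieces, then uncork so
    	 * any partial frame is pushed out in one go. */
    	tcp_sock_set_cork(sk, true);
    	/* ... kernel_sendmsg() headers and payload ... */
    	tcp_sock_set_cork(sk, false);
    }
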
....@@ -2760,14 +3127,14 @@
27603127 if (optlen < 1)
27613128 return -EINVAL;
27623129
2763
- val = strncpy_from_user(name, optval,
3130
+ val = strncpy_from_sockptr(name, optval,
27643131 min_t(long, TCP_CA_NAME_MAX-1, optlen));
27653132 if (val < 0)
27663133 return -EFAULT;
27673134 name[val] = 0;
27683135
27693136 lock_sock(sk);
2770
- err = tcp_set_congestion_control(sk, name, true, true,
3137
+ err = tcp_set_congestion_control(sk, name, true,
27713138 ns_capable(sock_net(sk)->user_ns,
27723139 CAP_NET_ADMIN));
27733140 release_sock(sk);
....@@ -2779,7 +3146,7 @@
27793146 if (optlen < 1)
27803147 return -EINVAL;
27813148
2782
- val = strncpy_from_user(name, optval,
3149
+ val = strncpy_from_sockptr(name, optval,
27833150 min_t(long, TCP_ULP_NAME_MAX - 1,
27843151 optlen));
27853152 if (val < 0)
....@@ -2792,15 +3159,23 @@
27923159 return err;
27933160 }
27943161 case TCP_FASTOPEN_KEY: {
2795
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3162
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3163
+ __u8 *backup_key = NULL;
27963164
2797
- if (optlen != sizeof(key))
3165
+ /* Allow a backup key as well to facilitate key rotation
3166
+ * First key is the active one.
3167
+ */
3168
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3169
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
27983170 return -EINVAL;
27993171
2800
- if (copy_from_user(key, optval, optlen))
3172
+ if (copy_from_sockptr(key, optval, optlen))
28013173 return -EFAULT;
28023174
2803
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
3175
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3176
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3177
+
3178
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
28043179 }
28053180 default:
28063181 /* fallthru */
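
[Sketch, not part of the patch] With the change above, TCP_FASTOPEN_KEY accepts either a single 16-byte key or a 32-byte buffer whose second half is a backup key, letting a server rotate keys without invalidating cookies issued under the previous one. A userspace sketch; the listener fd, helper name, and key buffers are hypothetical:

    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/tcp.h>	/* TCP_FASTOPEN_KEY */

    /* Install new_key as the active TFO key and keep old_key as backup so
     * outstanding cookies continue to validate. */
    static int rotate_tfo_keys(int listen_fd,
    			   const unsigned char new_key[16],
    			   const unsigned char old_key[16])
    {
    	unsigned char keys[2][16];

    	memcpy(keys[0], new_key, 16);	/* primary (active) key goes first */
    	memcpy(keys[1], old_key, 16);	/* backup key second */
    	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN_KEY,
    			  keys, sizeof(keys));
    }
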
....@@ -2810,7 +3185,7 @@
28103185 if (optlen < sizeof(int))
28113186 return -EINVAL;
28123187
2813
- if (get_user(val, (int __user *)optval))
3188
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
28143189 return -EFAULT;
28153190
28163191 lock_sock(sk);
....@@ -2829,20 +3204,7 @@
28293204 break;
28303205
28313206 case TCP_NODELAY:
2832
- if (val) {
2833
- /* TCP_NODELAY is weaker than TCP_CORK, so that
2834
- * this option on corked socket is remembered, but
2835
- * it is not activated until cork is cleared.
2836
- *
2837
- * However, when TCP_NODELAY is set we make
2838
- * an explicit push, which overrides even TCP_CORK
2839
- * for currently queued segments.
2840
- */
2841
- tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2842
- tcp_push_pending_frames(sk);
2843
- } else {
2844
- tp->nonagle &= ~TCP_NAGLE_OFF;
2845
- }
3207
+ __tcp_sock_set_nodelay(sk, val);
28463208 break;
28473209
28483210 case TCP_THIN_LINEAR_TIMEOUTS:
....@@ -2908,52 +3270,18 @@
29083270 case TCP_REPAIR_OPTIONS:
29093271 if (!tp->repair)
29103272 err = -EINVAL;
2911
- else if (sk->sk_state == TCP_ESTABLISHED)
2912
- err = tcp_repair_options_est(sk,
2913
- (struct tcp_repair_opt __user *)optval,
2914
- optlen);
3273
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
3274
+ err = tcp_repair_options_est(sk, optval, optlen);
29153275 else
29163276 err = -EPERM;
29173277 break;
29183278
29193279 case TCP_CORK:
2920
- /* When set indicates to always queue non-full frames.
2921
- * Later the user clears this option and we transmit
2922
- * any pending partial frames in the queue. This is
2923
- * meant to be used alongside sendfile() to get properly
2924
- * filled frames when the user (for example) must write
2925
- * out headers with a write() call first and then use
2926
- * sendfile to send out the data parts.
2927
- *
2928
- * TCP_CORK can be set together with TCP_NODELAY and it is
2929
- * stronger than TCP_NODELAY.
2930
- */
2931
- if (val) {
2932
- tp->nonagle |= TCP_NAGLE_CORK;
2933
- } else {
2934
- tp->nonagle &= ~TCP_NAGLE_CORK;
2935
- if (tp->nonagle&TCP_NAGLE_OFF)
2936
- tp->nonagle |= TCP_NAGLE_PUSH;
2937
- tcp_push_pending_frames(sk);
2938
- }
3280
+ __tcp_sock_set_cork(sk, val);
29393281 break;
29403282
29413283 case TCP_KEEPIDLE:
2942
- if (val < 1 || val > MAX_TCP_KEEPIDLE)
2943
- err = -EINVAL;
2944
- else {
2945
- tp->keepalive_time = val * HZ;
2946
- if (sock_flag(sk, SOCK_KEEPOPEN) &&
2947
- !((1 << sk->sk_state) &
2948
- (TCPF_CLOSE | TCPF_LISTEN))) {
2949
- u32 elapsed = keepalive_time_elapsed(tp);
2950
- if (tp->keepalive_time > elapsed)
2951
- elapsed = tp->keepalive_time - elapsed;
2952
- else
2953
- elapsed = 0;
2954
- inet_csk_reset_keepalive_timer(sk, elapsed);
2955
- }
2956
- }
3284
+ err = tcp_sock_set_keepidle_locked(sk, val);
29573285 break;
29583286 case TCP_KEEPINTVL:
29593287 if (val < 1 || val > MAX_TCP_KEEPINTVL)
....@@ -2975,7 +3303,8 @@
29753303 break;
29763304
29773305 case TCP_SAVE_SYN:
2978
- if (val < 0 || val > 1)
3306
+ /* 0: disable, 1: enable, 2: start from ether_header */
3307
+ if (val < 0 || val > 2)
29793308 err = -EINVAL;
29803309 else
29813310 tp->save_syn = val;
....@@ -2984,8 +3313,8 @@
29843313 case TCP_LINGER2:
29853314 if (val < 0)
29863315 tp->linger2 = -1;
2987
- else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2988
- tp->linger2 = 0;
3316
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3317
+ tp->linger2 = TCP_FIN_TIMEOUT_MAX;
29893318 else
29903319 tp->linger2 = val * HZ;
29913320 break;
....@@ -3010,19 +3339,7 @@
30103339 break;
30113340
30123341 case TCP_QUICKACK:
3013
- if (!val) {
3014
- icsk->icsk_ack.pingpong = 1;
3015
- } else {
3016
- icsk->icsk_ack.pingpong = 0;
3017
- if ((1 << sk->sk_state) &
3018
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3019
- inet_csk_ack_scheduled(sk)) {
3020
- icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3021
- tcp_cleanup_rbuf(sk, 1);
3022
- if (!(val & 1))
3023
- icsk->icsk_ack.pingpong = 1;
3024
- }
3025
- }
3342
+ __tcp_sock_set_quickack(sk, val);
30263343 break;
30273344
30283345 #ifdef CONFIG_TCP_MD5SIG
....@@ -3054,7 +3371,8 @@
30543371 case TCP_FASTOPEN_CONNECT:
30553372 if (val > 1 || val < 0) {
30563373 err = -EINVAL;
3057
- } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3374
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3375
+ TFO_CLIENT_ENABLE) {
30583376 if (sk->sk_state == TCP_CLOSE)
30593377 tp->fastopen_connect = val;
30603378 else
....@@ -3090,6 +3408,11 @@
30903408 else
30913409 tp->recvmsg_inq = val;
30923410 break;
3411
+ case TCP_TX_DELAY:
3412
+ if (val)
3413
+ tcp_enable_tx_delay();
3414
+ tp->tcp_tx_delay = val;
3415
+ break;
30933416 default:
30943417 err = -ENOPROTOOPT;
30953418 break;
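
[Sketch, not part of the patch] TCP_TX_DELAY, handled above, asks the stack to add an artificial transmit delay on this socket (useful for emulating long-RTT paths from a single host); the value is in microseconds and the constant comes from <linux/tcp.h>. A userspace sketch with an arbitrary fd and delay:

    int tx_delay_usec = 25000;	/* ~25 ms of added transmit delay */

    if (setsockopt(fd, IPPROTO_TCP, TCP_TX_DELAY, &tx_delay_usec,
    	       sizeof(tx_delay_usec)))
    	perror("TCP_TX_DELAY");
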
....@@ -3099,7 +3422,7 @@
30993422 return err;
31003423 }
31013424
3102
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3425
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
31033426 unsigned int optlen)
31043427 {
31053428 const struct inet_connection_sock *icsk = inet_csk(sk);
....@@ -3110,18 +3433,6 @@
31103433 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
31113434 }
31123435 EXPORT_SYMBOL(tcp_setsockopt);
3113
-
3114
-#ifdef CONFIG_COMPAT
3115
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3116
- char __user *optval, unsigned int optlen)
3117
-{
3118
- if (level != SOL_TCP)
3119
- return inet_csk_compat_setsockopt(sk, level, optname,
3120
- optval, optlen);
3121
- return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3122
-}
3123
-EXPORT_SYMBOL(compat_tcp_setsockopt);
3124
-#endif
31253436
31263437 static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
31273438 struct tcp_info *info)
....@@ -3147,10 +3458,10 @@
31473458 {
31483459 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
31493460 const struct inet_connection_sock *icsk = inet_csk(sk);
3461
+ unsigned long rate;
31503462 u32 now;
31513463 u64 rate64;
31523464 bool slow;
3153
- u32 rate;
31543465
31553466 memset(info, 0, sizeof(*info));
31563467 if (sk->sk_type != SOCK_STREAM)
....@@ -3160,11 +3471,11 @@
31603471
31613472 /* Report meaningful fields for all TCP states, including listeners */
31623473 rate = READ_ONCE(sk->sk_pacing_rate);
3163
- rate64 = rate != ~0U ? rate : ~0ULL;
3474
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
31643475 info->tcpi_pacing_rate = rate64;
31653476
31663477 rate = READ_ONCE(sk->sk_max_pacing_rate);
3167
- rate64 = rate != ~0U ? rate : ~0ULL;
3478
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
31683479 info->tcpi_max_pacing_rate = rate64;
31693480
31703481 info->tcpi_reordering = tp->reordering;
....@@ -3175,8 +3486,8 @@
31753486 * tcpi_unacked -> Number of children ready for accept()
31763487 * tcpi_sacked -> max backlog
31773488 */
3178
- info->tcpi_unacked = sk->sk_ack_backlog;
3179
- info->tcpi_sacked = sk->sk_max_ack_backlog;
3489
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3490
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
31803491 return;
31813492 }
31823493
....@@ -3254,6 +3565,9 @@
32543565 info->tcpi_bytes_retrans = tp->bytes_retrans;
32553566 info->tcpi_dsack_dups = tp->dsack_dups;
32563567 info->tcpi_reord_seen = tp->reord_seen;
3568
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3569
+ info->tcpi_snd_wnd = tp->snd_wnd;
3570
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
32573571 unlock_sock_fast(sk, slow);
32583572 }
32593573 EXPORT_SYMBOL_GPL(tcp_get_info);
....@@ -3282,16 +3596,21 @@
32823596 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
32833597 nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
32843598 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3599
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
3600
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
3601
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
3602
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
32853603 0;
32863604 }
32873605
3288
-struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3606
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3607
+ const struct sk_buff *orig_skb)
32893608 {
32903609 const struct tcp_sock *tp = tcp_sk(sk);
32913610 struct sk_buff *stats;
32923611 struct tcp_info info;
3612
+ unsigned long rate;
32933613 u64 rate64;
3294
- u32 rate;
32953614
32963615 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
32973616 if (!stats)
....@@ -3310,7 +3629,7 @@
33103629 tp->total_retrans, TCP_NLA_PAD);
33113630
33123631 rate = READ_ONCE(sk->sk_pacing_rate);
3313
- rate64 = rate != ~0U ? rate : ~0ULL;
3632
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
33143633 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
33153634
33163635 rate64 = tcp_compute_delivery_rate(tp);
....@@ -3335,6 +3654,12 @@
33353654 TCP_NLA_PAD);
33363655 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
33373656 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3657
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3658
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3659
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3660
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
3661
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3662
+ TCP_NLA_PAD);
33383663
33393664 return stats;
33403665 }
....@@ -3384,7 +3709,7 @@
33843709 case TCP_LINGER2:
33853710 val = tp->linger2;
33863711 if (val >= 0)
3387
- val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3712
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
33883713 break;
33893714 case TCP_DEFER_ACCEPT:
33903715 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
....@@ -3429,7 +3754,7 @@
34293754 return 0;
34303755 }
34313756 case TCP_QUICKACK:
3432
- val = !icsk->icsk_ack.pingpong;
3757
+ val = !inet_csk_in_pingpong_mode(sk);
34333758 break;
34343759
34353760 case TCP_CONGESTION:
....@@ -3458,21 +3783,15 @@
34583783 return 0;
34593784
34603785 case TCP_FASTOPEN_KEY: {
3461
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3462
- struct tcp_fastopen_context *ctx;
3786
+ u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3787
+ unsigned int key_len;
34633788
34643789 if (get_user(len, optlen))
34653790 return -EFAULT;
34663791
3467
- rcu_read_lock();
3468
- ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
3469
- if (ctx)
3470
- memcpy(key, ctx->key, sizeof(key));
3471
- else
3472
- len = 0;
3473
- rcu_read_unlock();
3474
-
3475
- len = min_t(unsigned int, len, sizeof(key));
3792
+ key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3793
+ TCP_FASTOPEN_KEY_LENGTH;
3794
+ len = min_t(unsigned int, len, key_len);
34763795 if (put_user(len, optlen))
34773796 return -EFAULT;
34783797 if (copy_to_user(optval, key, len))
....@@ -3545,6 +3864,10 @@
35453864 val = tp->fastopen_no_cookie;
35463865 break;
35473866
3867
+ case TCP_TX_DELAY:
3868
+ val = tp->tcp_tx_delay;
3869
+ break;
3870
+
35483871 case TCP_TIMESTAMP:
35493872 val = tcp_time_stamp_raw() + tp->tsoffset;
35503873 break;
....@@ -3563,20 +3886,21 @@
35633886
35643887 lock_sock(sk);
35653888 if (tp->saved_syn) {
3566
- if (len < tp->saved_syn[0]) {
3567
- if (put_user(tp->saved_syn[0], optlen)) {
3889
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
3890
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
3891
+ optlen)) {
35683892 release_sock(sk);
35693893 return -EFAULT;
35703894 }
35713895 release_sock(sk);
35723896 return -EINVAL;
35733897 }
3574
- len = tp->saved_syn[0];
3898
+ len = tcp_saved_syn_len(tp->saved_syn);
35753899 if (put_user(len, optlen)) {
35763900 release_sock(sk);
35773901 return -EFAULT;
35783902 }
3579
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3903
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
35803904 release_sock(sk);
35813905 return -EFAULT;
35823906 }
....@@ -3592,18 +3916,41 @@
35923916 }
35933917 #ifdef CONFIG_MMU
35943918 case TCP_ZEROCOPY_RECEIVE: {
3595
- struct tcp_zerocopy_receive zc;
3919
+ struct tcp_zerocopy_receive zc = {};
35963920 int err;
35973921
35983922 if (get_user(len, optlen))
35993923 return -EFAULT;
3600
- if (len != sizeof(zc))
3924
+ if (len < 0 ||
3925
+ len < offsetofend(struct tcp_zerocopy_receive, length))
36013926 return -EINVAL;
3927
+ if (len > sizeof(zc)) {
3928
+ len = sizeof(zc);
3929
+ if (put_user(len, optlen))
3930
+ return -EFAULT;
3931
+ }
36023932 if (copy_from_user(&zc, optval, len))
36033933 return -EFAULT;
36043934 lock_sock(sk);
36053935 err = tcp_zerocopy_receive(sk, &zc);
36063936 release_sock(sk);
3937
+ if (len >= offsetofend(struct tcp_zerocopy_receive, err))
3938
+ goto zerocopy_rcv_sk_err;
3939
+ switch (len) {
3940
+ case offsetofend(struct tcp_zerocopy_receive, err):
3941
+ goto zerocopy_rcv_sk_err;
3942
+ case offsetofend(struct tcp_zerocopy_receive, inq):
3943
+ goto zerocopy_rcv_inq;
3944
+ case offsetofend(struct tcp_zerocopy_receive, length):
3945
+ default:
3946
+ goto zerocopy_rcv_out;
3947
+ }
3948
+zerocopy_rcv_sk_err:
3949
+ if (!err)
3950
+ zc.err = sock_error(sk);
3951
+zerocopy_rcv_inq:
3952
+ zc.inq = tcp_inq_hint(sk);
3953
+zerocopy_rcv_out:
36073954 if (!err && copy_to_user(optval, &zc, len))
36083955 err = -EFAULT;
36093956 return err;
....@@ -3631,18 +3978,6 @@
36313978 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
36323979 }
36333980 EXPORT_SYMBOL(tcp_getsockopt);
3634
-
3635
-#ifdef CONFIG_COMPAT
3636
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3637
- char __user *optval, int __user *optlen)
3638
-{
3639
- if (level != SOL_TCP)
3640
- return inet_csk_compat_getsockopt(sk, level, optname,
3641
- optval, optlen);
3642
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3643
-}
3644
-EXPORT_SYMBOL(compat_tcp_getsockopt);
3645
-#endif
36463981
36473982 #ifdef CONFIG_TCP_MD5SIG
36483983 static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
....@@ -3686,20 +4021,28 @@
36864021 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
36874022 */
36884023 smp_wmb();
3689
- tcp_md5sig_pool_populated = true;
4024
+ /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
4025
+ * and tcp_get_md5sig_pool().
4026
+ */
4027
+ WRITE_ONCE(tcp_md5sig_pool_populated, true);
36904028 }
36914029
36924030 bool tcp_alloc_md5sig_pool(void)
36934031 {
3694
- if (unlikely(!tcp_md5sig_pool_populated)) {
4032
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4033
+ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
36954034 mutex_lock(&tcp_md5sig_mutex);
36964035
3697
- if (!tcp_md5sig_pool_populated)
4036
+ if (!tcp_md5sig_pool_populated) {
36984037 __tcp_alloc_md5sig_pool();
4038
+ if (tcp_md5sig_pool_populated)
4039
+ static_branch_inc(&tcp_md5_needed);
4040
+ }
36994041
37004042 mutex_unlock(&tcp_md5sig_mutex);
37014043 }
3702
- return tcp_md5sig_pool_populated;
4044
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4045
+ return READ_ONCE(tcp_md5sig_pool_populated);
37034046 }
37044047 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
37054048
....@@ -3715,7 +4058,8 @@
37154058 {
37164059 local_bh_disable();
37174060
3718
- if (tcp_md5sig_pool_populated) {
4061
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4062
+ if (READ_ONCE(tcp_md5sig_pool_populated)) {
37194063 /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
37204064 smp_rmb();
37214065 return this_cpu_ptr(&tcp_md5sig_pool);
....@@ -3745,8 +4089,8 @@
37454089 return 1;
37464090
37474091 for (i = 0; i < shi->nr_frags; ++i) {
3748
- const struct skb_frag_struct *f = &shi->frags[i];
3749
- unsigned int offset = f->page_offset;
4092
+ const skb_frag_t *f = &shi->frags[i];
4093
+ unsigned int offset = skb_frag_off(f);
37504094 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
37514095
37524096 sg_set_page(&sg, page, skb_frag_size(f),
....@@ -3772,8 +4116,8 @@
37724116 sg_init_one(&sg, key->key, keylen);
37734117 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
37744118
3775
- /* tcp_md5_do_add() might change key->key under us */
3776
- return crypto_ahash_update(hp->md5_req);
4119
+ /* We use data_race() because tcp_md5_do_add() might change key->key under us */
4120
+ return data_race(crypto_ahash_update(hp->md5_req));
37774121 }
37784122 EXPORT_SYMBOL(tcp_md5_hash_key);
37794123
....@@ -3781,7 +4125,13 @@
37814125
37824126 void tcp_done(struct sock *sk)
37834127 {
3784
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
4128
+ struct request_sock *req;
4129
+
4130
+ /* We might be called with a new socket, after
4131
+ * inet_csk_prepare_forced_close() has been called
4132
+ * so we can not use lockdep_sock_is_held(sk)
4133
+ */
4134
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
37854135
37864136 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
37874137 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
....@@ -3880,7 +4230,7 @@
38804230
38814231 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
38824232 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3883
- FIELD_SIZEOF(struct sk_buff, cb));
4233
+ sizeof_field(struct sk_buff, cb));
38844234
38854235 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
38864236 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
....@@ -3954,4 +4304,5 @@
39544304 tcp_metrics_init();
39554305 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
39564306 tcp_tasklet_init();
4307
+ mptcp_init();
39574308 }