2024-01-05 071106ecf68c401173c58808b1cf5f68cc50d390
kernel/include/net/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  * operating system. INET is implemented using the BSD Socket
@@ -9,11 +10,6 @@
  *
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- *
- * This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
  */
 #ifndef _TCP_H
 #define _TCP_H
@@ -27,9 +23,9 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
-#include <linux/cryptohash.h>
 #include <linux/kref.h>
 #include <linux/ktime.h>
+#include <linux/indirect_call_wrapper.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -43,10 +39,12 @@
 #include <net/tcp_states.h>
 #include <net/inet_ecn.h>
 #include <net/dst.h>
+#include <net/mptcp.h>
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/siphash.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -67,7 +65,7 @@
 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
 #define TCP_MIN_MSS 88U
 
-/* The least MTU to use for probing */
+/* The initial MTU to use for probing */
 #define TCP_BASE_MSS 1024
 
 /* probing interval, default to 10 minutes as per RFC4821 */
@@ -128,6 +126,7 @@
 				  * to combine FIN-WAIT-2 timeout with
 				  * TIME-WAIT timer.
 				  */
+#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
 
 #define TCP_DELACK_MAX ((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
 #if HZ >= 100
@@ -185,6 +184,7 @@
 #define TCPOPT_SACK		5	/* SACK Block */
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
+#define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
 #define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
 #define TCPOPT_EXP		254	/* Experimental */
 /* Magic number to be after the option value for sharing TCP
@@ -315,7 +315,7 @@
 
 void tcp_tasklet_init(void);
 
-void tcp_v4_err(struct sk_buff *skb, u32);
+int tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
@@ -331,6 +331,9 @@
 		 size_t size, int flags);
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			 size_t size, int flags);
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
+void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
+	      int size_goal);
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
@@ -345,13 +348,14 @@
 			struct pipe_inode_info *pipe, size_t len,
 			unsigned int flags);
 
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
-static inline void tcp_dec_quickack_mode(struct sock *sk,
-					 const unsigned int pkts)
+static inline void tcp_dec_quickack_mode(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ack.quick) {
+		/* How many ACKs S/ACKing new data have we sent? */
+		const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0;
+
 		if (pkts >= icsk->icsk_ack.quick) {
 			icsk->icsk_ack.quick = 0;
 			/* Leaving quickack mode we deflate ATO. */
@@ -389,32 +393,41 @@
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
+void __tcp_close(struct sock *sk, long timeout);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
-void tcp_init_transfer(struct sock *sk, int bpf_op);
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
 __poll_t tcp_poll(struct file *file, struct socket *sock,
 		  struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
-int tcp_setsockopt(struct sock *sk, int level, int optname,
-		   char __user *optval, unsigned int optlen);
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
-			  char __user *optval, int __user *optlen);
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
-			  char __user *optval, unsigned int optlen);
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		   unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
 void tcp_syn_ack_timeout(const struct request_sock *req);
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
 void tcp_data_ready(struct sock *sk);
+#ifdef CONFIG_MMU
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma);
+#endif
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
+/*
+ * BPF SKB-less helpers
+ */
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+			 struct tcphdr *th, u32 *cookie);
+u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
+			 struct tcphdr *th, u32 *cookie);
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+			  const struct tcp_request_sock_ops *af_ops,
+			  struct sock *sk, struct tcphdr *th);
 /*
  * TCP v4 functions exported for the inet6 API
  */
@@ -422,6 +435,7 @@
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
 void tcp_v4_mtu_reduced(struct sock *sk);
 void tcp_req_err(struct sock *sk, u32 seq, bool abort);
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(const struct sock *sk,
 				      struct request_sock *req,
@@ -443,7 +457,8 @@
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct tcp_fastopen_cookie *foc,
-				enum tcp_synack_type synack_type);
+				enum tcp_synack_type synack_type,
+				struct sk_buff *syn_skb);
 int tcp_disconnect(struct sock *sk, int flags);
 
 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -457,6 +472,9 @@
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
+struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+					    const struct tcp_request_sock_ops *af_ops,
+					    struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
 
 /* Syncookies use a monotonic timer which increments every 60 seconds.
@@ -539,7 +557,7 @@
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
-u64 cookie_init_timestamp(struct request_sock *req);
+u64 cookie_init_timestamp(struct request_sock *req, u64 now);
 bool cookie_timestamp_decode(const struct net *net,
 			     struct tcp_options_received *opt);
 bool cookie_ecn_ok(const struct tcp_options_received *opt,
@@ -594,6 +612,7 @@
 void tcp_reset(struct sock *sk);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
+void tcp_check_space(struct sock *sk);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
@@ -610,6 +629,7 @@
 
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 unsigned int tcp_current_mss(struct sock *sk);
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
 
 /* Bound MSS / TSO packet size with the half of the window */
 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
@@ -646,7 +666,6 @@
 int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 int tcp_mss_to_mtu(struct sock *sk, int mss);
 void tcp_mtup_init(struct sock *sk);
-void tcp_init_buffer_space(struct sock *sk);
 
 static inline void tcp_bound_rto(const struct sock *sk)
 {
@@ -661,6 +680,10 @@
 
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
+	/* mptcp hooks are only on the slow path */
+	if (sk_is_mptcp((struct sock *)tp))
+		return;
+
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
 			       ntohl(TCP_FLAG_ACK) |
 			       snd_wnd);
@@ -686,7 +709,7 @@
 static inline u32 tcp_rto_min(struct sock *sk)
 {
 	const struct dst_entry *dst = __sk_dst_get(sk);
-	u32 rto_min = TCP_RTO_MIN;
+	u32 rto_min = inet_csk(sk)->icsk_rto_min;
 
 	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
 		rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
@@ -745,7 +768,7 @@
 
 static inline u64 tcp_clock_ns(void)
 {
-	return local_clock();
+	return ktime_get_ns();
 }
 
 static inline u64 tcp_clock_us(void)
@@ -759,23 +782,19 @@
 	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
 }
 
+/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
+static inline u32 tcp_ns_to_ts(u64 ns)
+{
+	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
+}
+
 /* Could use tcp_clock_us() / 1000, but this version uses a single divide */
 static inline u32 tcp_time_stamp_raw(void)
 {
-	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
+	return tcp_ns_to_ts(tcp_clock_ns());
 }
 
-
-/* Refresh 1us clock of a TCP socket,
- * ensuring monotically increasing values.
- */
-static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
-{
-	u64 val = tcp_clock_us();
-
-	if (val > tp->tcp_mstamp)
-		tp->tcp_mstamp = val;
-}
+void tcp_mstamp_refresh(struct tcp_sock *tp);
 
 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 {
@@ -784,7 +803,13 @@
 
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
-	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+	return tcp_ns_to_ts(skb->skb_mstamp_ns);
+}
+
+/* provide the departure time in us unit */
+static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
+{
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
 
 
@@ -830,7 +855,7 @@
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted */
 #define TCPCB_LOST		0x04	/* SKB is lost */
 #define TCPCB_TAGBITS		0x07	/* All tag bits */
-#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp) */
+#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns) */
 #define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame */
 #define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
 				TCPCB_REPAIRED)
@@ -875,6 +900,23 @@
 	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
 }
 
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
+}
+
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.sk_redir;
+}
+
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
+}
+
+extern const struct inet_connection_sock_af_ops ipv4_specific;
+
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
  * as TCP moves IP6CB into a different location in skb->cb[]
@@ -900,17 +942,14 @@
 #endif
 	return 0;
 }
-#endif
 
-static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
-	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
-		return true;
+extern const struct inet_connection_sock_af_ops ipv6_specific;
+
+INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
+INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
+void tcp_v6_early_demux(struct sk_buff *skb);
+
 #endif
-	return false;
-}
 
 /* TCP_SKB_CB reference means this can not be used from early demux */
 static inline int tcp_v4_sdif(struct sk_buff *skb)
@@ -951,6 +990,13 @@
 	return likely(!TCP_SKB_CB(skb)->eor);
 }
 
+static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
+					const struct sk_buff *from)
+{
+	return likely(tcp_skb_can_collapse_to(to) &&
+		      mptcp_skb_can_collapse(to, from));
+}
+
 /* Events passed to congestion control interface */
 enum tcp_ca_event {
 	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
@@ -981,6 +1027,7 @@
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
+#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
 
 union tcp_cc_info;
 
@@ -1066,7 +1113,7 @@
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
-			       bool reinit, bool cap_net_admin);
+			       bool cap_net_admin);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
@@ -1075,6 +1122,7 @@
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+struct tcp_congestion_ops *tcp_ca_find(const char *name);
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
 u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
 #ifdef CONFIG_INET
@@ -1127,7 +1175,7 @@
  */
 static inline int tcp_is_sack(const struct tcp_sock *tp)
 {
-	return tp->rx_opt.sack_ok;
+	return likely(tp->rx_opt.sack_ok);
 }
 
 static inline bool tcp_is_reno(const struct tcp_sock *tp)
@@ -1230,11 +1278,14 @@
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
+	if (tp->is_cwnd_limited)
+		return true;
+
 	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
 	if (tcp_in_slow_start(tp))
 		return tp->snd_cwnd < 2 * tp->max_packets_out;
 
-	return tp->is_cwnd_limited;
+	return false;
 }
 
 /* BBR congestion control needs pacing.
@@ -1248,8 +1299,27 @@
 	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
 }
 
+/* Estimates in how many jiffies next packet for this flow can be sent.
+ * Scheduling a retransmit timer too early would be silly.
+ */
+static inline unsigned long tcp_pacing_delay(const struct sock *sk)
+{
+	s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
+
+	return delay > 0 ? nsecs_to_jiffies(delay) : 0;
+}
+
+static inline void tcp_reset_xmit_timer(struct sock *sk,
+					const int what,
+					unsigned long when,
+					const unsigned long max_when)
+{
+	inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk),
+				  max_when);
+}
+
 /* Something is really bad, we could not queue an additional packet,
- * because qdisc is full or receiver sent a 0 window.
+ * because qdisc is full or receiver sent a 0 window, or we are paced.
 * We do not want to add fuel to the fire, or abort too early,
 * so make sure the timer we arm now is at least 200ms in the future,
 * regardless of current icsk_rto value (as it could be ~2ms)
@@ -1271,8 +1341,8 @@
 static inline void tcp_check_probe_timer(struct sock *sk)
 {
 	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  tcp_probe0_base(sk), TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     tcp_probe0_base(sk), TCP_RTO_MAX);
 }
 
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
@@ -1291,36 +1361,19 @@
 static inline __sum16 tcp_v4_check(int len, __be32 saddr,
 				   __be32 daddr, __wsum base)
 {
-	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
-}
-
-static inline __sum16 __tcp_checksum_complete(struct sk_buff *skb)
-{
-	return __skb_checksum_complete(skb);
+	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
 }
 
 static inline bool tcp_checksum_complete(struct sk_buff *skb)
 {
 	return !skb_csum_unnecessary(skb) &&
-		__tcp_checksum_complete(skb);
+		__skb_checksum_complete(skb);
 }
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
 int tcp_filter(struct sock *sk, struct sk_buff *skb);
-
-#undef STATE_TRACE
-
-#ifdef STATE_TRACE
-static const char *statename[]={
-	"Unused","Established","Syn Sent","Syn Recv",
-	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
-	"Close Wait","Last ACK","Listen","Closing"
-};
-#endif
 void tcp_set_state(struct sock *sk, int state);
-
 void tcp_done(struct sock *sk);
-
 int tcp_abort(struct sock *sk, int err);
 
 static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
@@ -1329,7 +1382,6 @@
 	rx_opt->num_sacks = 0;
 }
 
-u32 tcp_default_init_rwnd(u32 mss);
 void tcp_cwnd_restart(struct sock *sk, s32 delta);
 
 static inline void tcp_slow_start_after_idle_check(struct sock *sk)
@@ -1338,8 +1390,8 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	s32 delta;
 
-	if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
-	    ca_ops->cong_control)
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) ||
+	    tp->packets_out || ca_ops->cong_control)
 		return;
 	delta = tcp_jiffies32 - tp->lsndtime;
 	if (delta > inet_csk(sk)->icsk_rto)
@@ -1354,7 +1406,7 @@
 
 static inline int tcp_win_from_space(const struct sock *sk, int space)
 {
-	int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
+	int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
 
 	return tcp_adv_win_scale <= 0 ?
 		(space>>(-tcp_adv_win_scale)) :
@@ -1364,14 +1416,17 @@
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len -
+	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
+				  READ_ONCE(sk->sk_backlog.len) -
 				  atomic_read(&sk->sk_rmem_alloc));
 }
 
 static inline int tcp_full_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, sk->sk_rcvbuf);
+	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
 }
+
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
 
 /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
 * If 87.5 % (7/8) of the space has been consumed, we want to override
@@ -1401,22 +1456,38 @@
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
+	int val;
 
-	return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
+	/* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl()
+	 * and do_tcp_setsockopt().
+	 */
+	val = READ_ONCE(tp->keepalive_intvl);
+
+	return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl);
 }
 
 static inline int keepalive_time_when(const struct tcp_sock *tp)
 {
	struct net *net = sock_net((struct sock *)tp);
+	int val;
 
-	return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time;
+	/* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */
+	val = READ_ONCE(tp->keepalive_time);
+
+	return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
 }
 
 static inline int keepalive_probes(const struct tcp_sock *tp)
 {
	struct net *net = sock_net((struct sock *)tp);
+	int val;
 
-	return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
+	/* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt()
+	 * and do_tcp_setsockopt().
+	 */
+	val = READ_ONCE(tp->keepalive_probes);
+
+	return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes);
 }
 
 static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
@@ -1429,7 +1500,8 @@
 
 static inline int tcp_fin_time(const struct sock *sk)
 {
-	int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
+	int fin_timeout = tcp_sk(sk)->linger2 ? :
+		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout);
 	const int rto = inet_csk(sk)->icsk_rto;
 
 	if (fin_timeout < (rto << 2) - (rto >> 1))
@@ -1516,8 +1588,9 @@
 	struct hlist_node node;
 	u8 keylen;
 	u8 family; /* AF_INET or AF_INET6 */
-	union tcp_md5_addr addr;
 	u8 prefixlen;
+	union tcp_md5_addr addr;
+	int l3index; /* set if key added with L3 scope */
 	u8 key[TCP_MD5SIG_MAXKEYLEN];
 	struct rcu_head rcu;
 };
@@ -1561,22 +1634,33 @@
 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 			const struct sock *sk, const struct sk_buff *skb);
 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
-		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
-		   gfp_t gfp);
+		   int family, u8 prefixlen, int l3index,
+		   const u8 *newkey, u8 newkeylen, gfp_t gfp);
 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
-		   int family, u8 prefixlen);
+		   int family, u8 prefixlen, int l3index);
 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 					 const struct sock *addr_sk);
 
 #ifdef CONFIG_TCP_MD5SIG
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
-					 const union tcp_md5_addr *addr,
-					 int family);
+#include <linux/jump_label.h>
+extern struct static_key_false tcp_md5_needed;
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+					   const union tcp_md5_addr *addr,
+					   int family);
+static inline struct tcp_md5sig_key *
+tcp_md5_do_lookup(const struct sock *sk, int l3index,
+		  const union tcp_md5_addr *addr, int family)
+{
+	if (!static_branch_unlikely(&tcp_md5_needed))
+		return NULL;
+	return __tcp_md5_do_lookup(sk, l3index, addr, family);
+}
+
 #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key)
 #else
-static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
-					 const union tcp_md5_addr *addr,
-					 int family)
+static inline struct tcp_md5sig_key *
+tcp_md5_do_lookup(const struct sock *sk, int l3index,
+		  const union tcp_md5_addr *addr, int family)
 {
 	return NULL;
 }
@@ -1608,12 +1692,15 @@
 	struct msghdr *data; /* data in MSG_FASTOPEN */
 	size_t size;
 	int copied; /* queued in tcp_connect() */
+	struct ubuf_info *uarg;
 };
 void tcp_free_fastopen_req(struct tcp_sock *tp);
 void tcp_fastopen_destroy_cipher(struct sock *sk);
 void tcp_fastopen_ctx_destroy(struct net *net);
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *key, unsigned int len);
+			      void *primary_key, void *backup_key);
+int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
+			    u64 *key);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
@@ -1623,13 +1710,16 @@
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			       struct tcp_fastopen_cookie *cookie);
 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
-#define TCP_FASTOPEN_KEY_LENGTH 16
+#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
+#define TCP_FASTOPEN_KEY_MAX 2
+#define TCP_FASTOPEN_KEY_BUF_LENGTH \
+	(TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	struct crypto_cipher *tfm;
-	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
-	struct rcu_head rcu;
+	siphash_key_t key[TCP_FASTOPEN_KEY_MAX];
+	int num;
+	struct rcu_head rcu;
 };
 
 extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
@@ -1637,6 +1727,35 @@
 bool tcp_fastopen_active_should_disable(struct sock *sk);
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
 void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
+
+/* Caller needs to wrap with rcu_read_(un)lock() */
+static inline
+struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
+{
+	struct tcp_fastopen_context *ctx;
+
+	ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
+	if (!ctx)
+		ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+	return ctx;
+}
+
+static inline
+bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
+			       const struct tcp_fastopen_cookie *orig)
+{
+	if (orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
+	    orig->len == foc->len &&
+	    !memcmp(orig->val, foc->val, foc->len))
+		return true;
+	return false;
+}
+
+static inline
+int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
+{
+	return ctx->num;
+}
 
 /* Latencies incurred by various limits for a sender. They are
 * chronograph-like stats that are mutually exclusive.
@@ -1705,9 +1824,18 @@
 	return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
+/**
+ * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
+ * @sk: socket
+ *
+ * Since the write queue can have a temporary empty skb in it,
+ * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
+ */
 static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-	return skb_queue_empty(&sk->sk_write_queue);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->write_seq == tp->snd_nxt;
 }
 
 static inline bool tcp_rtx_queue_empty(const struct sock *sk)
@@ -1720,20 +1848,9 @@
 	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
-static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
-{
-	if (tcp_write_queue_empty(sk))
-		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-}
-
-static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
-{
-	__skb_queue_tail(&sk->sk_write_queue, skb);
-}
-
 static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
-	__tcp_add_write_queue_tail(sk, skb);
+	__skb_queue_tail(&sk->sk_write_queue, skb);
 
 	/* Queue it, remembering where we must start sending. */
 	if (sk->sk_write_queue.next == skb)
@@ -1855,6 +1972,7 @@
 	struct seq_net_private p;
 	enum tcp_seq_states state;
 	struct sock *syn_wait_sk;
+	struct tcp_seq_afinfo *bpf_seq_afinfo;
 	int bucket, offset, sbucket, num;
 	loff_t last_pos;
 };
@@ -1867,6 +1985,10 @@
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
+INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
 int tcp_gro_complete(struct sk_buff *skb);
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
@@ -1874,15 +1996,24 @@
 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 {
 	struct net *net = sock_net((struct sock *)tp);
-	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
+	u32 val;
+
+	val = READ_ONCE(tp->notsent_lowat);
+
+	return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
 }
 
-static inline bool tcp_stream_memory_free(const struct sock *sk)
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
 {
	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt;
+	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
+			    READ_ONCE(tp->snd_nxt);
 
-	return notsent_bytes < tcp_notsent_lowat(tp);
+	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1906,7 +2037,7 @@
 					  const struct sk_buff *skb);
 	int (*md5_parse)(struct sock *sk,
 			 int optname,
-			 char __user *optval,
+			 sockptr_t optval,
 			 int optlen);
 #endif
 };
@@ -1935,8 +2066,14 @@
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
-			   enum tcp_synack_type synack_type);
+			   enum tcp_synack_type synack_type,
+			   struct sk_buff *syn_skb);
 };
+
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
+#if IS_ENABLED(CONFIG_IPV6)
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
+#endif
 
 #ifdef CONFIG_SYN_COOKIES
 static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
@@ -1977,7 +2114,7 @@
 {
 	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
-	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+	u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
 
 	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
 }
@@ -2077,34 +2214,72 @@
 #define TCP_ULP_MAX 128
 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
 
-enum {
-	TCP_ULP_TLS,
-	TCP_ULP_BPF,
-};
-
 struct tcp_ulp_ops {
 	struct list_head list;
 
 	/* initialize ulp */
 	int (*init)(struct sock *sk);
+	/* update ulp */
+	void (*update)(struct sock *sk, struct proto *p,
+		       void (*write_space)(struct sock *sk));
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
+	/* diagnostic */
+	int (*get_info)(const struct sock *sk, struct sk_buff *skb);
+	size_t (*get_info_size)(const struct sock *sk);
+	/* clone ulp */
+	void (*clone)(const struct request_sock *req, struct sock *newsk,
+		      const gfp_t priority);
 
-	int uid;
 	char name[TCP_ULP_NAME_MAX];
-	bool user_visible;
 	struct module *owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
-int tcp_set_ulp_id(struct sock *sk, const int ulp);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
+void tcp_update_ulp(struct sock *sk, struct proto *p,
+		    void (*write_space)(struct sock *sk));
 
 #define MODULE_ALIAS_TCP_ULP(name)				\
 	__MODULE_INFO(alias, alias_userspace, name);		\
 	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+
+struct sk_msg;
+struct sk_psock;
+
+#ifdef CONFIG_BPF_STREAM_PARSER
+struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
+#else
+static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+#ifdef CONFIG_NET_SOCK_MSG
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
+			  int flags);
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+		      struct msghdr *msg, int len, int flags);
+#endif /* CONFIG_NET_SOCK_MSG */
+
+#ifdef CONFIG_CGROUP_BPF
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
+				      struct sk_buff *skb,
+				      unsigned int end_offset)
+{
+	skops->skb = skb;
+	skops->skb_data_end = skb->data + end_offset;
+}
+#else
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
+				      struct sk_buff *skb,
+				      unsigned int end_offset)
+{
+}
+#endif
 
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
 * is < 0, then the BPF op failed (for example if the loaded BPF
@@ -2197,6 +2372,12 @@
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
+static inline void tcp_bpf_rtt(struct sock *sk)
+{
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
+		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
+}
+
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
 #endif
@@ -2205,7 +2386,29 @@
 void clean_acked_data_enable(struct inet_connection_sock *icsk,
 			     void (*cad)(struct sock *sk, u32 ack_seq));
 void clean_acked_data_disable(struct inet_connection_sock *icsk);
-
+void clean_acked_data_flush(void);
 #endif
 
+DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+static inline void tcp_add_tx_delay(struct sk_buff *skb,
+				    const struct tcp_sock *tp)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled))
+		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
+}
+
+/* Compute Earliest Departure Time for some control packets
+ * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
+ */
+static inline u64 tcp_transmit_time(const struct sock *sk)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
+			tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
+
+		return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
+	}
+	return 0;
+}
+
 #endif /* _TCP_H */