hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/include/net/tcp.h
....@@ -1,3 +1,4 @@
1
+/* SPDX-License-Identifier: GPL-2.0-or-later */
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -9,11 +10,6 @@
910 *
1011 * Authors: Ross Biro
1112 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12
- *
13
- * This program is free software; you can redistribute it and/or
14
- * modify it under the terms of the GNU General Public License
15
- * as published by the Free Software Foundation; either version
16
- * 2 of the License, or (at your option) any later version.
1713 */
1814 #ifndef _TCP_H
1915 #define _TCP_H
....@@ -27,9 +23,9 @@
2723 #include <linux/cache.h>
2824 #include <linux/percpu.h>
2925 #include <linux/skbuff.h>
30
-#include <linux/cryptohash.h>
3126 #include <linux/kref.h>
3227 #include <linux/ktime.h>
28
+#include <linux/indirect_call_wrapper.h>
3329
3430 #include <net/inet_connection_sock.h>
3531 #include <net/inet_timewait_sock.h>
....@@ -43,10 +39,12 @@
4339 #include <net/tcp_states.h>
4440 #include <net/inet_ecn.h>
4541 #include <net/dst.h>
42
+#include <net/mptcp.h>
4643
4744 #include <linux/seq_file.h>
4845 #include <linux/memcontrol.h>
4946 #include <linux/bpf-cgroup.h>
47
+#include <linux/siphash.h>
5048
5149 extern struct inet_hashinfo tcp_hashinfo;
5250
....@@ -67,7 +65,7 @@
6765 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
6866 #define TCP_MIN_MSS 88U
6967
70
-/* The least MTU to use for probing */
68
+/* The initial MTU to use for probing */
7169 #define TCP_BASE_MSS 1024
7270
7371 /* probing interval, default to 10 minutes as per RFC4821 */
....@@ -128,6 +126,7 @@
128126 * to combine FIN-WAIT-2 timeout with
129127 * TIME-WAIT timer.
130128 */
129
+#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
131130
132131 #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
133132 #if HZ >= 100
....@@ -185,6 +184,7 @@
185184 #define TCPOPT_SACK 5 /* SACK Block */
186185 #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
187186 #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
187
+#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
188188 #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
189189 #define TCPOPT_EXP 254 /* Experimental */
190190 /* Magic number to be after the option value for sharing TCP
....@@ -315,7 +315,7 @@
315315
316316 void tcp_tasklet_init(void);
317317
318
-void tcp_v4_err(struct sk_buff *skb, u32);
318
+int tcp_v4_err(struct sk_buff *skb, u32);
319319
320320 void tcp_shutdown(struct sock *sk, int how);
321321
....@@ -331,6 +331,9 @@
331331 size_t size, int flags);
332332 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
333333 size_t size, int flags);
334
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
335
+void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
336
+ int size_goal);
334337 void tcp_release_cb(struct sock *sk);
335338 void tcp_wfree(struct sk_buff *skb);
336339 void tcp_write_timer_handler(struct sock *sk);
....@@ -391,30 +394,38 @@
391394 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
392395 void tcp_close(struct sock *sk, long timeout);
393396 void tcp_init_sock(struct sock *sk);
394
-void tcp_init_transfer(struct sock *sk, int bpf_op);
397
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
395398 __poll_t tcp_poll(struct file *file, struct socket *sock,
396399 struct poll_table_struct *wait);
397400 int tcp_getsockopt(struct sock *sk, int level, int optname,
398401 char __user *optval, int __user *optlen);
399
-int tcp_setsockopt(struct sock *sk, int level, int optname,
400
- char __user *optval, unsigned int optlen);
401
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
402
- char __user *optval, int __user *optlen);
403
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
404
- char __user *optval, unsigned int optlen);
402
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
403
+ unsigned int optlen);
405404 void tcp_set_keepalive(struct sock *sk, int val);
406405 void tcp_syn_ack_timeout(const struct request_sock *req);
407406 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
408407 int flags, int *addr_len);
409408 int tcp_set_rcvlowat(struct sock *sk, int val);
410409 void tcp_data_ready(struct sock *sk);
410
+#ifdef CONFIG_MMU
411411 int tcp_mmap(struct file *file, struct socket *sock,
412412 struct vm_area_struct *vma);
413
+#endif
413414 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
414415 struct tcp_options_received *opt_rx,
415416 int estab, struct tcp_fastopen_cookie *foc);
416417 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
417418
419
+/*
420
+ * BPF SKB-less helpers
421
+ */
422
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
423
+ struct tcphdr *th, u32 *cookie);
424
+u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
425
+ struct tcphdr *th, u32 *cookie);
426
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
427
+ const struct tcp_request_sock_ops *af_ops,
428
+ struct sock *sk, struct tcphdr *th);
418429 /*
419430 * TCP v4 functions exported for the inet6 API
420431 */
....@@ -422,6 +433,7 @@
422433 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
423434 void tcp_v4_mtu_reduced(struct sock *sk);
424435 void tcp_req_err(struct sock *sk, u32 seq, bool abort);
436
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
425437 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
426438 struct sock *tcp_create_openreq_child(const struct sock *sk,
427439 struct request_sock *req,
....@@ -443,7 +455,8 @@
443455 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
444456 struct request_sock *req,
445457 struct tcp_fastopen_cookie *foc,
446
- enum tcp_synack_type synack_type);
458
+ enum tcp_synack_type synack_type,
459
+ struct sk_buff *syn_skb);
447460 int tcp_disconnect(struct sock *sk, int flags);
448461
449462 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
....@@ -457,6 +470,9 @@
457470 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
458471 u32 cookie);
459472 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
473
+struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
474
+ const struct tcp_request_sock_ops *af_ops,
475
+ struct sock *sk, struct sk_buff *skb);
460476 #ifdef CONFIG_SYN_COOKIES
461477
462478 /* Syncookies use a monotonic timer which increments every 60 seconds.
....@@ -539,7 +555,7 @@
539555 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
540556 u16 *mssp);
541557 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
542
-u64 cookie_init_timestamp(struct request_sock *req);
558
+u64 cookie_init_timestamp(struct request_sock *req, u64 now);
543559 bool cookie_timestamp_decode(const struct net *net,
544560 struct tcp_options_received *opt);
545561 bool cookie_ecn_ok(const struct tcp_options_received *opt,
....@@ -594,6 +610,7 @@
594610 void tcp_reset(struct sock *sk);
595611 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
596612 void tcp_fin(struct sock *sk);
613
+void tcp_check_space(struct sock *sk);
597614
598615 /* tcp_timer.c */
599616 void tcp_init_xmit_timers(struct sock *);
....@@ -610,6 +627,7 @@
610627
611628 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
612629 unsigned int tcp_current_mss(struct sock *sk);
630
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
613631
614632 /* Bound MSS / TSO packet size with the half of the window */
615633 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
....@@ -646,7 +664,6 @@
646664 int tcp_mtu_to_mss(struct sock *sk, int pmtu);
647665 int tcp_mss_to_mtu(struct sock *sk, int mss);
648666 void tcp_mtup_init(struct sock *sk);
649
-void tcp_init_buffer_space(struct sock *sk);
650667
651668 static inline void tcp_bound_rto(const struct sock *sk)
652669 {
....@@ -661,6 +678,10 @@
661678
662679 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
663680 {
681
+ /* mptcp hooks are only on the slow path */
682
+ if (sk_is_mptcp((struct sock *)tp))
683
+ return;
684
+
664685 tp->pred_flags = htonl((tp->tcp_header_len << 26) |
665686 ntohl(TCP_FLAG_ACK) |
666687 snd_wnd);
....@@ -686,7 +707,7 @@
686707 static inline u32 tcp_rto_min(struct sock *sk)
687708 {
688709 const struct dst_entry *dst = __sk_dst_get(sk);
689
- u32 rto_min = TCP_RTO_MIN;
710
+ u32 rto_min = inet_csk(sk)->icsk_rto_min;
690711
691712 if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
692713 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
....@@ -745,7 +766,7 @@
745766
746767 static inline u64 tcp_clock_ns(void)
747768 {
748
- return local_clock();
769
+ return ktime_get_ns();
749770 }
750771
751772 static inline u64 tcp_clock_us(void)
....@@ -759,23 +780,19 @@
759780 return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
760781 }
761782
783
+/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
784
+static inline u32 tcp_ns_to_ts(u64 ns)
785
+{
786
+ return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
787
+}
788
+
762789 /* Could use tcp_clock_us() / 1000, but this version uses a single divide */
763790 static inline u32 tcp_time_stamp_raw(void)
764791 {
765
- return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
792
+ return tcp_ns_to_ts(tcp_clock_ns());
766793 }
767794
768
-
769
-/* Refresh 1us clock of a TCP socket,
770
- * ensuring monotically increasing values.
771
- */
772
-static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
773
-{
774
- u64 val = tcp_clock_us();
775
-
776
- if (val > tp->tcp_mstamp)
777
- tp->tcp_mstamp = val;
778
-}
795
+void tcp_mstamp_refresh(struct tcp_sock *tp);
779796
780797 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
781798 {
....@@ -784,7 +801,13 @@
784801
785802 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
786803 {
787
- return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
804
+ return tcp_ns_to_ts(skb->skb_mstamp_ns);
805
+}
806
+
807
+/* provide the departure time in us unit */
808
+static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
809
+{
810
+ return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
788811 }
789812
790813
....@@ -830,7 +853,7 @@
830853 #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
831854 #define TCPCB_LOST 0x04 /* SKB is lost */
832855 #define TCPCB_TAGBITS 0x07 /* All tag bits */
833
-#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */
856
+#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */
834857 #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
835858 #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
836859 TCPCB_REPAIRED)
....@@ -875,6 +898,23 @@
875898 TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
876899 }
877900
901
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
902
+{
903
+ return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
904
+}
905
+
906
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
907
+{
908
+ return TCP_SKB_CB(skb)->bpf.sk_redir;
909
+}
910
+
911
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
912
+{
913
+ TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
914
+}
915
+
916
+extern const struct inet_connection_sock_af_ops ipv4_specific;
917
+
878918 #if IS_ENABLED(CONFIG_IPV6)
879919 /* This is the variant of inet6_iif() that must be used by TCP,
880920 * as TCP moves IP6CB into a different location in skb->cb[]
....@@ -900,17 +940,14 @@
900940 #endif
901941 return 0;
902942 }
903
-#endif
904943
905
-static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
906
-{
907
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
908
- if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
909
- skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
910
- return true;
944
+extern const struct inet_connection_sock_af_ops ipv6_specific;
945
+
946
+INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
947
+INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
948
+void tcp_v6_early_demux(struct sk_buff *skb);
949
+
911950 #endif
912
- return false;
913
-}
914951
915952 /* TCP_SKB_CB reference means this can not be used from early demux */
916953 static inline int tcp_v4_sdif(struct sk_buff *skb)
....@@ -951,6 +988,13 @@
951988 return likely(!TCP_SKB_CB(skb)->eor);
952989 }
953990
991
+static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
992
+ const struct sk_buff *from)
993
+{
994
+ return likely(tcp_skb_can_collapse_to(to) &&
995
+ mptcp_skb_can_collapse(to, from));
996
+}
997
+
954998 /* Events passed to congestion control interface */
955999 enum tcp_ca_event {
9561000 CA_EVENT_TX_START, /* first transmit when no packets in flight */
....@@ -981,6 +1025,7 @@
9811025 #define TCP_CONG_NON_RESTRICTED 0x1
9821026 /* Requires ECN/ECT set on all packets */
9831027 #define TCP_CONG_NEEDS_ECN 0x2
1028
+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
9841029
9851030 union tcp_cc_info;
9861031
....@@ -1066,7 +1111,7 @@
10661111 void tcp_get_allowed_congestion_control(char *buf, size_t len);
10671112 int tcp_set_allowed_congestion_control(char *allowed);
10681113 int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
1069
- bool reinit, bool cap_net_admin);
1114
+ bool cap_net_admin);
10701115 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
10711116 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
10721117
....@@ -1075,6 +1120,7 @@
10751120 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
10761121 extern struct tcp_congestion_ops tcp_reno;
10771122
1123
+struct tcp_congestion_ops *tcp_ca_find(const char *name);
10781124 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
10791125 u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
10801126 #ifdef CONFIG_INET
....@@ -1127,7 +1173,7 @@
11271173 */
11281174 static inline int tcp_is_sack(const struct tcp_sock *tp)
11291175 {
1130
- return tp->rx_opt.sack_ok;
1176
+ return likely(tp->rx_opt.sack_ok);
11311177 }
11321178
11331179 static inline bool tcp_is_reno(const struct tcp_sock *tp)
....@@ -1230,11 +1276,14 @@
12301276 {
12311277 const struct tcp_sock *tp = tcp_sk(sk);
12321278
1279
+ if (tp->is_cwnd_limited)
1280
+ return true;
1281
+
12331282 /* If in slow start, ensure cwnd grows to twice what was ACKed. */
12341283 if (tcp_in_slow_start(tp))
12351284 return tp->snd_cwnd < 2 * tp->max_packets_out;
12361285
1237
- return tp->is_cwnd_limited;
1286
+ return false;
12381287 }
12391288
12401289 /* BBR congestion control needs pacing.
....@@ -1248,8 +1297,27 @@
12481297 return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
12491298 }
12501299
1300
+/* Estimates in how many jiffies next packet for this flow can be sent.
1301
+ * Scheduling a retransmit timer too early would be silly.
1302
+ */
1303
+static inline unsigned long tcp_pacing_delay(const struct sock *sk)
1304
+{
1305
+ s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
1306
+
1307
+ return delay > 0 ? nsecs_to_jiffies(delay) : 0;
1308
+}
1309
+
1310
+static inline void tcp_reset_xmit_timer(struct sock *sk,
1311
+ const int what,
1312
+ unsigned long when,
1313
+ const unsigned long max_when)
1314
+{
1315
+ inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk),
1316
+ max_when);
1317
+}
1318
+
12511319 /* Something is really bad, we could not queue an additional packet,
1252
- * because qdisc is full or receiver sent a 0 window.
1320
+ * because qdisc is full or receiver sent a 0 window, or we are paced.
12531321 * We do not want to add fuel to the fire, or abort too early,
12541322 * so make sure the timer we arm now is at least 200ms in the future,
12551323 * regardless of current icsk_rto value (as it could be ~2ms)
....@@ -1271,8 +1339,8 @@
12711339 static inline void tcp_check_probe_timer(struct sock *sk)
12721340 {
12731341 if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
1274
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
1275
- tcp_probe0_base(sk), TCP_RTO_MAX);
1342
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
1343
+ tcp_probe0_base(sk), TCP_RTO_MAX);
12761344 }
12771345
12781346 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
....@@ -1291,36 +1359,19 @@
12911359 static inline __sum16 tcp_v4_check(int len, __be32 saddr,
12921360 __be32 daddr, __wsum base)
12931361 {
1294
- return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1295
-}
1296
-
1297
-static inline __sum16 __tcp_checksum_complete(struct sk_buff *skb)
1298
-{
1299
- return __skb_checksum_complete(skb);
1362
+ return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
13001363 }
13011364
13021365 static inline bool tcp_checksum_complete(struct sk_buff *skb)
13031366 {
13041367 return !skb_csum_unnecessary(skb) &&
1305
- __tcp_checksum_complete(skb);
1368
+ __skb_checksum_complete(skb);
13061369 }
13071370
13081371 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
13091372 int tcp_filter(struct sock *sk, struct sk_buff *skb);
1310
-
1311
-#undef STATE_TRACE
1312
-
1313
-#ifdef STATE_TRACE
1314
-static const char *statename[]={
1315
- "Unused","Established","Syn Sent","Syn Recv",
1316
- "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
1317
- "Close Wait","Last ACK","Listen","Closing"
1318
-};
1319
-#endif
13201373 void tcp_set_state(struct sock *sk, int state);
1321
-
13221374 void tcp_done(struct sock *sk);
1323
-
13241375 int tcp_abort(struct sock *sk, int err);
13251376
13261377 static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
....@@ -1329,7 +1380,6 @@
13291380 rx_opt->num_sacks = 0;
13301381 }
13311382
1332
-u32 tcp_default_init_rwnd(u32 mss);
13331383 void tcp_cwnd_restart(struct sock *sk, s32 delta);
13341384
13351385 static inline void tcp_slow_start_after_idle_check(struct sock *sk)
....@@ -1338,8 +1388,8 @@
13381388 struct tcp_sock *tp = tcp_sk(sk);
13391389 s32 delta;
13401390
1341
- if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
1342
- ca_ops->cong_control)
1391
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) ||
1392
+ tp->packets_out || ca_ops->cong_control)
13431393 return;
13441394 delta = tcp_jiffies32 - tp->lsndtime;
13451395 if (delta > inet_csk(sk)->icsk_rto)
....@@ -1354,7 +1404,7 @@
13541404
13551405 static inline int tcp_win_from_space(const struct sock *sk, int space)
13561406 {
1357
- int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
1407
+ int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
13581408
13591409 return tcp_adv_win_scale <= 0 ?
13601410 (space>>(-tcp_adv_win_scale)) :
....@@ -1364,14 +1414,17 @@
13641414 /* Note: caller must be prepared to deal with negative returns */
13651415 static inline int tcp_space(const struct sock *sk)
13661416 {
1367
- return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len -
1417
+ return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
1418
+ READ_ONCE(sk->sk_backlog.len) -
13681419 atomic_read(&sk->sk_rmem_alloc));
13691420 }
13701421
13711422 static inline int tcp_full_space(const struct sock *sk)
13721423 {
1373
- return tcp_win_from_space(sk, sk->sk_rcvbuf);
1424
+ return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
13741425 }
1426
+
1427
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
13751428
13761429 /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
13771430 * If 87.5 % (7/8) of the space has been consumed, we want to override
....@@ -1402,21 +1455,24 @@
14021455 {
14031456 struct net *net = sock_net((struct sock *)tp);
14041457
1405
- return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
1458
+ return tp->keepalive_intvl ? :
1459
+ READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl);
14061460 }
14071461
14081462 static inline int keepalive_time_when(const struct tcp_sock *tp)
14091463 {
14101464 struct net *net = sock_net((struct sock *)tp);
14111465
1412
- return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time;
1466
+ return tp->keepalive_time ? :
1467
+ READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
14131468 }
14141469
14151470 static inline int keepalive_probes(const struct tcp_sock *tp)
14161471 {
14171472 struct net *net = sock_net((struct sock *)tp);
14181473
1419
- return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
1474
+ return tp->keepalive_probes ? :
1475
+ READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes);
14201476 }
14211477
14221478 static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
....@@ -1429,7 +1485,8 @@
14291485
14301486 static inline int tcp_fin_time(const struct sock *sk)
14311487 {
1432
- int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
1488
+ int fin_timeout = tcp_sk(sk)->linger2 ? :
1489
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout);
14331490 const int rto = inet_csk(sk)->icsk_rto;
14341491
14351492 if (fin_timeout < (rto << 2) - (rto >> 1))
....@@ -1516,8 +1573,9 @@
15161573 struct hlist_node node;
15171574 u8 keylen;
15181575 u8 family; /* AF_INET or AF_INET6 */
1519
- union tcp_md5_addr addr;
15201576 u8 prefixlen;
1577
+ union tcp_md5_addr addr;
1578
+ int l3index; /* set if key added with L3 scope */
15211579 u8 key[TCP_MD5SIG_MAXKEYLEN];
15221580 struct rcu_head rcu;
15231581 };
....@@ -1561,22 +1619,33 @@
15611619 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
15621620 const struct sock *sk, const struct sk_buff *skb);
15631621 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1564
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1565
- gfp_t gfp);
1622
+ int family, u8 prefixlen, int l3index,
1623
+ const u8 *newkey, u8 newkeylen, gfp_t gfp);
15661624 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
1567
- int family, u8 prefixlen);
1625
+ int family, u8 prefixlen, int l3index);
15681626 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
15691627 const struct sock *addr_sk);
15701628
15711629 #ifdef CONFIG_TCP_MD5SIG
1572
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
1573
- const union tcp_md5_addr *addr,
1574
- int family);
1630
+#include <linux/jump_label.h>
1631
+extern struct static_key_false tcp_md5_needed;
1632
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1633
+ const union tcp_md5_addr *addr,
1634
+ int family);
1635
+static inline struct tcp_md5sig_key *
1636
+tcp_md5_do_lookup(const struct sock *sk, int l3index,
1637
+ const union tcp_md5_addr *addr, int family)
1638
+{
1639
+ if (!static_branch_unlikely(&tcp_md5_needed))
1640
+ return NULL;
1641
+ return __tcp_md5_do_lookup(sk, l3index, addr, family);
1642
+}
1643
+
15751644 #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key)
15761645 #else
1577
-static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
1578
- const union tcp_md5_addr *addr,
1579
- int family)
1646
+static inline struct tcp_md5sig_key *
1647
+tcp_md5_do_lookup(const struct sock *sk, int l3index,
1648
+ const union tcp_md5_addr *addr, int family)
15801649 {
15811650 return NULL;
15821651 }
....@@ -1608,12 +1677,15 @@
16081677 struct msghdr *data; /* data in MSG_FASTOPEN */
16091678 size_t size;
16101679 int copied; /* queued in tcp_connect() */
1680
+ struct ubuf_info *uarg;
16111681 };
16121682 void tcp_free_fastopen_req(struct tcp_sock *tp);
16131683 void tcp_fastopen_destroy_cipher(struct sock *sk);
16141684 void tcp_fastopen_ctx_destroy(struct net *net);
16151685 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
1616
- void *key, unsigned int len);
1686
+ void *primary_key, void *backup_key);
1687
+int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
1688
+ u64 *key);
16171689 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
16181690 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
16191691 struct request_sock *req,
....@@ -1623,13 +1695,16 @@
16231695 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
16241696 struct tcp_fastopen_cookie *cookie);
16251697 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
1626
-#define TCP_FASTOPEN_KEY_LENGTH 16
1698
+#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
1699
+#define TCP_FASTOPEN_KEY_MAX 2
1700
+#define TCP_FASTOPEN_KEY_BUF_LENGTH \
1701
+ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
16271702
16281703 /* Fastopen key context */
16291704 struct tcp_fastopen_context {
1630
- struct crypto_cipher *tfm;
1631
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
1632
- struct rcu_head rcu;
1705
+ siphash_key_t key[TCP_FASTOPEN_KEY_MAX];
1706
+ int num;
1707
+ struct rcu_head rcu;
16331708 };
16341709
16351710 extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
....@@ -1637,6 +1712,35 @@
16371712 bool tcp_fastopen_active_should_disable(struct sock *sk);
16381713 void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
16391714 void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
1715
+
1716
+/* Caller needs to wrap with rcu_read_(un)lock() */
1717
+static inline
1718
+struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
1719
+{
1720
+ struct tcp_fastopen_context *ctx;
1721
+
1722
+ ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
1723
+ if (!ctx)
1724
+ ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
1725
+ return ctx;
1726
+}
1727
+
1728
+static inline
1729
+bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
1730
+ const struct tcp_fastopen_cookie *orig)
1731
+{
1732
+ if (orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
1733
+ orig->len == foc->len &&
1734
+ !memcmp(orig->val, foc->val, foc->len))
1735
+ return true;
1736
+ return false;
1737
+}
1738
+
1739
+static inline
1740
+int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
1741
+{
1742
+ return ctx->num;
1743
+}
16401744
16411745 /* Latencies incurred by various limits for a sender. They are
16421746 * chronograph-like stats that are mutually exclusive.
....@@ -1705,9 +1809,18 @@
17051809 return skb_queue_is_last(&sk->sk_write_queue, skb);
17061810 }
17071811
1812
+/**
1813
+ * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
1814
+ * @sk: socket
1815
+ *
1816
+ * Since the write queue can have a temporary empty skb in it,
1817
+ * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
1818
+ */
17081819 static inline bool tcp_write_queue_empty(const struct sock *sk)
17091820 {
1710
- return skb_queue_empty(&sk->sk_write_queue);
1821
+ const struct tcp_sock *tp = tcp_sk(sk);
1822
+
1823
+ return tp->write_seq == tp->snd_nxt;
17111824 }
17121825
17131826 static inline bool tcp_rtx_queue_empty(const struct sock *sk)
....@@ -1720,20 +1833,9 @@
17201833 return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
17211834 }
17221835
1723
-static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
1724
-{
1725
- if (tcp_write_queue_empty(sk))
1726
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
1727
-}
1728
-
1729
-static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
1730
-{
1731
- __skb_queue_tail(&sk->sk_write_queue, skb);
1732
-}
1733
-
17341836 static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
17351837 {
1736
- __tcp_add_write_queue_tail(sk, skb);
1838
+ __skb_queue_tail(&sk->sk_write_queue, skb);
17371839
17381840 /* Queue it, remembering where we must start sending. */
17391841 if (sk->sk_write_queue.next == skb)
....@@ -1855,6 +1957,7 @@
18551957 struct seq_net_private p;
18561958 enum tcp_seq_states state;
18571959 struct sock *syn_wait_sk;
1960
+ struct tcp_seq_afinfo *bpf_seq_afinfo;
18581961 int bucket, offset, sbucket, num;
18591962 loff_t last_pos;
18601963 };
....@@ -1867,6 +1970,10 @@
18671970 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
18681971 netdev_features_t features);
18691972 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
1973
+INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
1974
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
1975
+INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
1976
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
18701977 int tcp_gro_complete(struct sk_buff *skb);
18711978
18721979 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
....@@ -1874,15 +1981,20 @@
18741981 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
18751982 {
18761983 struct net *net = sock_net((struct sock *)tp);
1877
- return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
1984
+ return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
18781985 }
18791986
1880
-static inline bool tcp_stream_memory_free(const struct sock *sk)
1987
+/* @wake is one when sk_stream_write_space() calls us.
1988
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
1989
+ * This mimics the strategy used in sock_def_write_space().
1990
+ */
1991
+static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
18811992 {
18821993 const struct tcp_sock *tp = tcp_sk(sk);
1883
- u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt;
1994
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) -
1995
+ READ_ONCE(tp->snd_nxt);
18841996
1885
- return notsent_bytes < tcp_notsent_lowat(tp);
1997
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
18861998 }
18871999
18882000 #ifdef CONFIG_PROC_FS
....@@ -1906,7 +2018,7 @@
19062018 const struct sk_buff *skb);
19072019 int (*md5_parse)(struct sock *sk,
19082020 int optname,
1909
- char __user *optval,
2021
+ sockptr_t optval,
19102022 int optlen);
19112023 #endif
19122024 };
....@@ -1935,8 +2047,14 @@
19352047 int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
19362048 struct flowi *fl, struct request_sock *req,
19372049 struct tcp_fastopen_cookie *foc,
1938
- enum tcp_synack_type synack_type);
2050
+ enum tcp_synack_type synack_type,
2051
+ struct sk_buff *syn_skb);
19392052 };
2053
+
2054
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2055
+#if IS_ENABLED(CONFIG_IPV6)
2056
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2057
+#endif
19402058
19412059 #ifdef CONFIG_SYN_COOKIES
19422060 static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
....@@ -1977,7 +2095,7 @@
19772095 {
19782096 const struct sk_buff *skb = tcp_rtx_queue_head(sk);
19792097 u32 rto = inet_csk(sk)->icsk_rto;
1980
- u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
2098
+ u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
19812099
19822100 return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
19832101 }
....@@ -2077,34 +2195,72 @@
20772195 #define TCP_ULP_MAX 128
20782196 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
20792197
2080
-enum {
2081
- TCP_ULP_TLS,
2082
- TCP_ULP_BPF,
2083
-};
2084
-
20852198 struct tcp_ulp_ops {
20862199 struct list_head list;
20872200
20882201 /* initialize ulp */
20892202 int (*init)(struct sock *sk);
2203
+ /* update ulp */
2204
+ void (*update)(struct sock *sk, struct proto *p,
2205
+ void (*write_space)(struct sock *sk));
20902206 /* cleanup ulp */
20912207 void (*release)(struct sock *sk);
2208
+ /* diagnostic */
2209
+ int (*get_info)(const struct sock *sk, struct sk_buff *skb);
2210
+ size_t (*get_info_size)(const struct sock *sk);
2211
+ /* clone ulp */
2212
+ void (*clone)(const struct request_sock *req, struct sock *newsk,
2213
+ const gfp_t priority);
20922214
2093
- int uid;
20942215 char name[TCP_ULP_NAME_MAX];
2095
- bool user_visible;
20962216 struct module *owner;
20972217 };
20982218 int tcp_register_ulp(struct tcp_ulp_ops *type);
20992219 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
21002220 int tcp_set_ulp(struct sock *sk, const char *name);
2101
-int tcp_set_ulp_id(struct sock *sk, const int ulp);
21022221 void tcp_get_available_ulp(char *buf, size_t len);
21032222 void tcp_cleanup_ulp(struct sock *sk);
2223
+void tcp_update_ulp(struct sock *sk, struct proto *p,
2224
+ void (*write_space)(struct sock *sk));
21042225
21052226 #define MODULE_ALIAS_TCP_ULP(name) \
21062227 __MODULE_INFO(alias, alias_userspace, name); \
21072228 __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
2229
+
2230
+struct sk_msg;
2231
+struct sk_psock;
2232
+
2233
+#ifdef CONFIG_BPF_STREAM_PARSER
2234
+struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
2235
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
2236
+#else
2237
+static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
2238
+{
2239
+}
2240
+#endif /* CONFIG_BPF_STREAM_PARSER */
2241
+
2242
+#ifdef CONFIG_NET_SOCK_MSG
2243
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
2244
+ int flags);
2245
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
2246
+ struct msghdr *msg, int len, int flags);
2247
+#endif /* CONFIG_NET_SOCK_MSG */
2248
+
2249
+#ifdef CONFIG_CGROUP_BPF
2250
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
2251
+ struct sk_buff *skb,
2252
+ unsigned int end_offset)
2253
+{
2254
+ skops->skb = skb;
2255
+ skops->skb_data_end = skb->data + end_offset;
2256
+}
2257
+#else
2258
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
2259
+ struct sk_buff *skb,
2260
+ unsigned int end_offset)
2261
+{
2262
+}
2263
+#endif
21082264
21092265 /* Call BPF_SOCK_OPS program that returns an int. If the return value
21102266 * is < 0, then the BPF op failed (for example if the loaded BPF
....@@ -2197,6 +2353,12 @@
21972353 return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
21982354 }
21992355
2356
+static inline void tcp_bpf_rtt(struct sock *sk)
2357
+{
2358
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
2359
+ tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
2360
+}
2361
+
22002362 #if IS_ENABLED(CONFIG_SMC)
22012363 extern struct static_key_false tcp_have_smc;
22022364 #endif
....@@ -2205,7 +2367,29 @@
22052367 void clean_acked_data_enable(struct inet_connection_sock *icsk,
22062368 void (*cad)(struct sock *sk, u32 ack_seq));
22072369 void clean_acked_data_disable(struct inet_connection_sock *icsk);
2208
-
2370
+void clean_acked_data_flush(void);
22092371 #endif
22102372
2373
+DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2374
+static inline void tcp_add_tx_delay(struct sk_buff *skb,
2375
+ const struct tcp_sock *tp)
2376
+{
2377
+ if (static_branch_unlikely(&tcp_tx_delay_enabled))
2378
+ skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
2379
+}
2380
+
2381
+/* Compute Earliest Departure Time for some control packets
2382
+ * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
2383
+ */
2384
+static inline u64 tcp_transmit_time(const struct sock *sk)
2385
+{
2386
+ if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
2387
+ u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
2388
+ tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
2389
+
2390
+ return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
2391
+ }
2392
+ return 0;
2393
+}
2394
+
22112395 #endif /* _TCP_H */