2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/net/ipv4/tcp_minisocks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
@@ -179,7 +180,7 @@
 			 * Oh well... nobody has a sufficient solution to this
 			 * protocol bug yet.
 			 */
-			if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) {
+			if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
 kill:
 				inet_twsk_deschedule_put(tw);
 				return TCP_TW_SUCCESS;
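
Note: the switch to READ_ONCE() pairs with a WRITE_ONCE() on the sysctl writer side. sysctl_tcp_rfc1337 can be flipped via /proc/sys/net/ipv4/tcp_rfc1337 while timewait processing reads it locklessly, so both sides are annotated to keep each access a single, untorn load/store. A minimal sketch of the pattern (the helper names are illustrative, not code from this file):

        /* Reader: lockless fast path, e.g. packet processing. */
        static bool rfc1337_enabled(const struct net *net)
        {
                return READ_ONCE(net->ipv4.sysctl_tcp_rfc1337);
        }

        /* Writer: sysctl handler storing a new value concurrently. */
        static void rfc1337_set(struct net *net, int val)
        {
                WRITE_ONCE(net->ipv4.sysctl_tcp_rfc1337, val);
        }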
@@ -265,6 +266,7 @@
 
 		tw->tw_transparent	= inet->transparent;
 		tw->tw_mark		= sk->sk_mark;
+		tw->tw_priority		= sk->sk_priority;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
@@ -273,7 +275,7 @@
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
-
+		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
@@ -282,6 +284,7 @@
 			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_tclass = np->tclass;
 			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+			tw->tw_txhash = sk->sk_txhash;
 			tw->tw_ipv6only = sk->sk_ipv6only;
 		}
 #endif
@@ -294,12 +297,15 @@
 		 * so the timewait ack generating code has the key.
 		 */
 		do {
-			struct tcp_md5sig_key *key;
 			tcptw->tw_md5_key = NULL;
-			key = tp->af_specific->md5_lookup(sk, sk);
-			if (key) {
-				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
-				BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
+			if (static_branch_unlikely(&tcp_md5_needed)) {
+				struct tcp_md5sig_key *key;
+
+				key = tp->af_specific->md5_lookup(sk, sk);
+				if (key) {
+					tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+					BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
+				}
 			}
 		} while (0);
 #endif
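
Note: static_branch_unlikely(&tcp_md5_needed) compiles to a NOP until the first MD5 key is ever installed, so connections that never use TCP-MD5 skip the lookup entirely. A minimal standalone sketch of the static-key mechanism (example names, not the actual tcp.c wiring):

        #include <linux/jump_label.h>

        DEFINE_STATIC_KEY_FALSE(example_md5_needed);

        /* Called when the first key is added (cf. tcp_md5_do_add()):
         * patches every branch site from a NOP to a jump at runtime.
         */
        static void example_key_added(void)
        {
                static_branch_inc(&example_md5_needed);
        }

        static bool example_fast_path(void)
        {
                if (static_branch_unlikely(&example_md5_needed))
                        return true;    /* slow path: at least one key exists */
                return false;           /* default: the branch is a NOP */
        }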
@@ -338,10 +344,12 @@
 void tcp_twsk_destructor(struct sock *sk)
 {
 #ifdef CONFIG_TCP_MD5SIG
-	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+	if (static_branch_unlikely(&tcp_md5_needed)) {
+		struct tcp_timewait_sock *twsk = tcp_twsk(sk);
 
-	if (twsk->tw_md5_key)
-		kfree_rcu(twsk->tw_md5_key, rcu);
+		if (twsk->tw_md5_key)
+			kfree_rcu(twsk->tw_md5_key, rcu);
+	}
 #endif
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
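
Note: the destructor takes the same static branch, and the key is freed with kfree_rcu() rather than kfree() because a concurrent reader may still be dereferencing tw_md5_key under rcu_read_lock(); the free must wait out a grace period. Sketch of the kind of reader this protects (hypothetical helper):

        static bool example_peer_has_key(const struct tcp_timewait_sock *tcptw)
        {
                const struct tcp_md5sig_key *key;
                bool ret;

                rcu_read_lock();
                key = READ_ONCE(tcptw->tw_md5_key);     /* may race with destructor */
                ret = key != NULL;
                rcu_read_unlock();      /* kfree_rcu() frees only after this */
                return ret;
        }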
@@ -406,7 +414,7 @@
 
 	rcu_read_lock();
 	ca = tcp_ca_find_key(ca_key);
-	if (likely(ca && try_module_get(ca->owner))) {
+	if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
 		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
 		icsk->icsk_ca_ops = ca;
 		ca_got_dst = true;
@@ -417,7 +425,7 @@
 	/* If no valid choice made yet, assign current system default ca. */
 	if (!ca_got_dst &&
 	    (!icsk->icsk_ca_setsockopt ||
-	     !try_module_get(icsk->icsk_ca_ops->owner)))
+	     !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
 		tcp_assign_congestion_control(sk);
 
 	tcp_set_ca_state(sk, TCP_CA_Open);
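
Note: bpf_try_module_get() came in with BPF struct_ops so that a congestion-control algorithm implemented in BPF (which has no real module owner) can be refcounted like a modular one. Its shape is roughly the following (a sketch; see include/linux/bpf.h for the real definition):

        static inline bool bpf_try_module_get(const void *data, struct module *owner)
        {
                if (owner == BPF_MODULE_OWNER)          /* sentinel: a bpf struct_ops */
                        return bpf_struct_ops_get(data);
                return try_module_get(owner);           /* ordinary kernel module */
        }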
@@ -474,50 +482,25 @@
 	WRITE_ONCE(newtp->rcv_nxt, seq);
 	newtp->segs_in = 1;
 
-	newtp->snd_sml = newtp->snd_una =
-	newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+	seq = treq->snt_isn + 1;
+	newtp->snd_sml = newtp->snd_una = seq;
+	WRITE_ONCE(newtp->snd_nxt, seq);
+	newtp->snd_up = seq;
 
 	INIT_LIST_HEAD(&newtp->tsq_node);
 	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
 
 	tcp_init_wl(newtp, treq->rcv_isn);
 
-	newtp->srtt_us = 0;
-	newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
-	newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
 
-	newtp->packets_out = 0;
-	newtp->retrans_out = 0;
-	newtp->sacked_out = 0;
-	newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	newtp->tlp_high_seq = 0;
 	newtp->lsndtime = tcp_jiffies32;
 	newsk->sk_txhash = treq->txhash;
-	newtp->last_oow_ack_time = 0;
 	newtp->total_retrans = req->num_retrans;
-
-	/* So many TCP implementations out there (incorrectly) count the
-	 * initial SYN frame in their delayed-ACK and congestion control
-	 * algorithms that we must have the following bandaid to talk
-	 * efficiently to them.  -DaveM
-	 */
-	newtp->snd_cwnd = TCP_INIT_CWND;
-	newtp->snd_cwnd_cnt = 0;
-
-	/* There's a bubble in the pipe until at least the first ACK. */
-	newtp->app_limited = ~0U;
 
 	tcp_init_xmit_timers(newsk);
 	WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
-
-	newtp->rx_opt.saw_tstamp = 0;
-
-	newtp->rx_opt.dsack = 0;
-	newtp->rx_opt.num_sacks = 0;
-
-	newtp->urg_data = 0;
 
 	if (sock_flag(newsk, SOCK_KEEPOPEN))
 		inet_csk_reset_keepalive_timer(newsk,
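
Note: the deleted stores look load-bearing but appear to be redundant. The child socket is cloned from the listener, and a listener already carries these values from tcp_init_sock() (or a later tcp_disconnect()); the "-DaveM" bandaid comment lives on next to the snd_cwnd initialization there. For reference, tcp_init_sock() in net/ipv4/tcp.c sets, among others:

        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd = TCP_INIT_CWND;   /* 10 segments, per RFC 6928 */
        tp->app_limited = ~0U;          /* bubble in the pipe until first ACK */
        tp->rack.reo_wnd_steps = 1;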
@@ -540,17 +523,22 @@
 	newtp->max_window = newtp->snd_wnd;
 
 	if (newtp->rx_opt.tstamp_ok) {
-		newtp->rx_opt.ts_recent = req->ts_recent;
+		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
 		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 	} else {
 		newtp->rx_opt.ts_recent_stamp = 0;
 		newtp->tcp_header_len = sizeof(struct tcphdr);
 	}
+	if (req->num_timeout) {
+		newtp->undo_marker = treq->snt_isn;
+		newtp->retrans_stamp = div_u64(treq->snt_synack,
+					       USEC_PER_SEC / TCP_TS_HZ);
+	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
 	newtp->md5sig_info = NULL;	/*XXX*/
-	if (newtp->af_specific->md5_lookup(sk, newsk))
+	if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req)))
 		newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
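
Note: the retrans_stamp conversion is simpler than it looks. treq->snt_synack is a microsecond timestamp while retrans_stamp runs on the TCP timestamp clock; with the kernel's TCP_TS_HZ of 1000 the divisor reduces to a microsecond-to-millisecond conversion:

        USEC_PER_SEC / TCP_TS_HZ = 1000000 / 1000 = 1000
        retrans_stamp = div_u64(snt_synack, 1000)       /* usec -> TCP ticks */

Seeding undo_marker and retrans_stamp when the SYNACK was retransmitted (req->num_timeout != 0) gives the child socket the state it needs to later detect and undo a spurious SYNACK retransmission.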
@@ -558,14 +546,9 @@
 		newtp->rx_opt.mss_clamp = req->mss;
 	tcp_ecn_openreq_child(newtp, req);
 	newtp->fastopen_req = NULL;
-	newtp->fastopen_rsk = NULL;
-	newtp->syn_data_acked = 0;
-	newtp->rack.mstamp = 0;
-	newtp->rack.advanced = 0;
-	newtp->rack.reo_wnd_steps = 1;
-	newtp->rack.last_delivered = 0;
-	newtp->rack.reo_wnd_persist = 0;
-	newtp->rack.dsack_seen = 0;
+	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+
+	tcp_bpf_clone(sk, newsk);
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 
@@ -582,6 +565,9 @@
  * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
  *
  * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ *
+ * Note: If @fastopen is true, this can be called from process context.
+ *	 Otherwise, this is from BH context.
  */
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
@@ -600,7 +586,7 @@
 		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
-			tmp_opt.ts_recent = req->ts_recent;
+			tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
 			if (tmp_opt.rcv_tsecr)
 				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
 			/* We do not store true stamp, but it is not required,
@@ -734,14 +720,17 @@
 					  &tcp_rsk(req)->last_oow_ack_time))
 			req->rsk_ops->send_ack(sk, skb, req);
 		if (paws_reject)
-			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
 		return NULL;
 	}
 
 	/* In sequence, PAWS is OK. */
 
+	/* TODO: We probably should defer ts_recent change once
+	 * we take ownership of @req.
+	 */
 	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
-		req->ts_recent = tmp_opt.rcv_tsval;
+		WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);
 
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
 		/* Truncate SYN, it is out of window starting
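
Note: request sockets are processed locklessly, so two CPUs can handle segments for the same request at once; that is why req->ts_recent gets the READ_ONCE()/WRITE_ONCE() pairing here and above. Illustrative interleaving only:

        /*   CPU 0 (tcp_check_req)                 CPU 1 (tcp_check_req)
         *   WRITE_ONCE(req->ts_recent, tsval);    tmp_opt.ts_recent =
         *                                             READ_ONCE(req->ts_recent);
         *
         * The annotations keep each access a single untorn load/store and
         * document the intentional data race for tools such as KCSAN.
         */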
@@ -753,7 +742,7 @@
 	 *	  "fourth, check the SYN bit"
 	 */
 	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
-		__TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 		goto embryonic_reset;
 	}
 
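
Note: dropping the __ prefix on the SNMP macros here and in the PAWS path above matches the context note added to the function's header comment. The double-underscore variants boil down to __this_cpu_inc(), which is only correct with BH/preemption already disabled, while the plain variants are also safe from process context, which tcp_check_req() can now run in for fast open. Roughly, per include/net/tcp.h (the underlying this_cpu semantics live in net/snmp.h):

        #define TCP_INC_STATS(net, field)   SNMP_INC_STATS((net)->mib.tcp_statistics, field)
        #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field)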
@@ -790,6 +779,12 @@
 							 req, &own_req);
 	if (!child)
 		goto listen_overflow;
+
+	if (own_req && rsk_drop_req(req)) {
+		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_drop_and_put(sk, req);
+		return child;
+	}
 
 	sock_rps_save_rxhash(child, skb);
 	tcp_synack_rtt_meas(child, req);
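
Note: the rsk_drop_req() hook was added for MPTCP, where a subflow's request socket must sometimes be dropped instead of being queued on the listener's accept queue even though the child was created successfully. The helper is roughly a flag test on the request (a sketch under that assumption; consult the header for the actual definition):

        static inline bool rsk_drop_req(const struct request_sock *req)
        {
                return tcp_rsk(req)->drop_req;
        }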
@@ -839,6 +834,7 @@
 
 int tcp_child_process(struct sock *parent, struct sock *child,
 		      struct sk_buff *skb)
+	__releases(&((child)->sk_lock.slock))
 {
 	int ret = 0;
 	int state = child->sk_state;
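
Note: __releases() is an annotation for the sparse static checker documenting that the function exits with the named lock released even though it did not take it itself: tcp_child_process() is entered with the child socket locked and drops that lock internally. A standalone example of the acquire/release annotation pair (illustrative functions):

        static void example_take(spinlock_t *lock)
                __acquires(lock)
        {
                spin_lock(lock);
        }

        static void example_drop(spinlock_t *lock)
                __releases(lock)
        {
                spin_unlock(lock);
        }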