hc
2024-09-20 a36159eec6ca17402b0e146b86efaf76568dc353
kernel/net/packet/af_packet.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -43,13 +44,6 @@
4344 * Chetan Loke : Implemented TPACKET_V3 block abstraction
4445 * layer.
4546 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46
- *
47
- *
48
- * This program is free software; you can redistribute it and/or
49
- * modify it under the terms of the GNU General Public License
50
- * as published by the Free Software Foundation; either version
51
- * 2 of the License, or (at your option) any later version.
52
- *
5347 */
5448
5549 #include <linux/types.h>
....@@ -63,7 +57,6 @@
6357 #include <linux/if_packet.h>
6458 #include <linux/wireless.h>
6559 #include <linux/kernel.h>
66
-#include <linux/delay.h>
6760 #include <linux/kmod.h>
6861 #include <linux/slab.h>
6962 #include <linux/vmalloc.h>
....@@ -100,52 +93,56 @@
10093
10194 /*
10295 Assumptions:
103
- - if device has no dev->hard_header routine, it adds and removes ll header
104
- inside itself. In this case ll header is invisible outside of device,
105
- but higher levels still should reserve dev->hard_header_len.
106
- Some devices are enough clever to reallocate skb, when header
107
- will not fit to reserved space (tunnel), another ones are silly
108
- (PPP).
96
+ - If the device has no dev->header_ops->create, there is no LL header
97
+ visible above the device. In this case, its hard_header_len should be 0.
98
+ The device may prepend its own header internally. In this case, its
99
+ needed_headroom should be set to the space needed for it to add its
100
+ internal header.
101
+ For example, a WiFi driver pretending to be an Ethernet driver should
102
+ set its hard_header_len to be the Ethernet header length, and set its
103
+ needed_headroom to be (the real WiFi header length - the fake Ethernet
104
+ header length).
109105 - packet socket receives packets with pulled ll header,
110106 so that SOCK_RAW should push it back.
111107
112108 On receive:
113109 -----------
114110
115
-Incoming, dev->hard_header!=NULL
111
+Incoming, dev_has_header(dev) == true
116112 mac_header -> ll header
117113 data -> data
118114
119
-Outgoing, dev->hard_header!=NULL
115
+Outgoing, dev_has_header(dev) == true
120116 mac_header -> ll header
121117 data -> ll header
122118
123
-Incoming, dev->hard_header==NULL
124
- mac_header -> UNKNOWN position. It is very likely, that it points to ll
125
- header. PPP makes it, that is wrong, because introduce
126
- assymetry between rx and tx paths.
119
+Incoming, dev_has_header(dev) == false
120
+ mac_header -> data
121
+ However drivers often make it point to the ll header.
122
+ This is incorrect because the ll header should be invisible to us.
127123 data -> data
128124
129
-Outgoing, dev->hard_header==NULL
130
- mac_header -> data. ll header is still not built!
125
+Outgoing, dev_has_header(dev) == false
126
+ mac_header -> data. ll header is invisible to us.
131127 data -> data
132128
133129 Resume
134
- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
130
+ If dev_has_header(dev) == false we are unable to restore the ll header,
131
+ because it is invisible to us.
135132
136133
137134 On transmit:
138135 ------------
139136
140
-dev->hard_header != NULL
137
+dev->header_ops != NULL
141138 mac_header -> ll header
142139 data -> ll header
143140
144
-dev->hard_header == NULL (ll header is added by device, we cannot control it)
141
+dev->header_ops == NULL (ll header is invisible to us)
145142 mac_header -> data
146143 data -> data
147144
148
- We should set nh.raw on output to correct posistion,
145
+ We should set network_header on output to the correct position,
149146 packet classifier depends on it.
150147 */
151148
....@@ -184,7 +181,6 @@
184181 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185182 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186183 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187
-#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
188184
189185 struct packet_sock;
190186 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
....@@ -273,27 +269,26 @@
273269
274270 static bool packet_use_direct_xmit(const struct packet_sock *po)
275271 {
276
- return po->xmit == packet_direct_xmit;
277
-}
278
-
279
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
280
- struct net_device *sb_dev)
281
-{
282
- return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
272
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
273
+ return READ_ONCE(po->xmit) == packet_direct_xmit;
283274 }
284275
285276 static u16 packet_pick_tx_queue(struct sk_buff *skb)
286277 {
287278 struct net_device *dev = skb->dev;
288279 const struct net_device_ops *ops = dev->netdev_ops;
280
+ int cpu = raw_smp_processor_id();
289281 u16 queue_index;
290282
283
+#ifdef CONFIG_XPS
284
+ skb->sender_cpu = cpu + 1;
285
+#endif
286
+ skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
291287 if (ops->ndo_select_queue) {
292
- queue_index = ops->ndo_select_queue(dev, skb, NULL,
293
- __packet_pick_tx_queue);
288
+ queue_index = ops->ndo_select_queue(dev, skb, NULL);
294289 queue_index = netdev_cap_txqueue(dev, queue_index);
295290 } else {
296
- queue_index = __packet_pick_tx_queue(dev, skb, NULL);
291
+ queue_index = netdev_pick_tx(dev, skb, NULL);
297292 }
298293
299294 return queue_index;
....@@ -371,18 +366,20 @@
371366 {
372367 union tpacket_uhdr h;
373368
369
+ /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
370
+
374371 h.raw = frame;
375372 switch (po->tp_version) {
376373 case TPACKET_V1:
377
- h.h1->tp_status = status;
374
+ WRITE_ONCE(h.h1->tp_status, status);
378375 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
379376 break;
380377 case TPACKET_V2:
381
- h.h2->tp_status = status;
378
+ WRITE_ONCE(h.h2->tp_status, status);
382379 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
383380 break;
384381 case TPACKET_V3:
385
- h.h3->tp_status = status;
382
+ WRITE_ONCE(h.h3->tp_status, status);
386383 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
387384 break;
388385 default:
....@@ -393,23 +390,25 @@
393390 smp_wmb();
394391 }
395392
396
-static int __packet_get_status(struct packet_sock *po, void *frame)
393
+static int __packet_get_status(const struct packet_sock *po, void *frame)
397394 {
398395 union tpacket_uhdr h;
399396
400397 smp_rmb();
401398
399
+ /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
400
+
402401 h.raw = frame;
403402 switch (po->tp_version) {
404403 case TPACKET_V1:
405404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
406
- return h.h1->tp_status;
405
+ return READ_ONCE(h.h1->tp_status);
407406 case TPACKET_V2:
408407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
409
- return h.h2->tp_status;
408
+ return READ_ONCE(h.h2->tp_status);
410409 case TPACKET_V3:
411410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
412
- return h.h3->tp_status;
411
+ return READ_ONCE(h.h3->tp_status);
413412 default:
414413 WARN(1, "TPACKET version not supported.\n");
415414 BUG();
....@@ -417,17 +416,18 @@
417416 }
418417 }
419418
420
-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
419
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
421420 unsigned int flags)
422421 {
423422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
424423
425424 if (shhwtstamps &&
426425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
427
- ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
426
+ ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
428427 return TP_STATUS_TS_RAW_HARDWARE;
429428
430
- if (ktime_to_timespec_cond(skb->tstamp, ts))
429
+ if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
430
+ ktime_to_timespec64_cond(skb->tstamp, ts))
431431 return TP_STATUS_TS_SOFTWARE;
432432
433433 return 0;
....@@ -437,13 +437,20 @@
437437 struct sk_buff *skb)
438438 {
439439 union tpacket_uhdr h;
440
- struct timespec ts;
440
+ struct timespec64 ts;
441441 __u32 ts_status;
442442
443443 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
444444 return 0;
445445
446446 h.raw = frame;
447
+ /*
448
+ * versions 1 through 3 overflow the timestamps in y2106, since they
449
+ * all store the seconds in a 32-bit unsigned integer.
450
+ * If we create a version 4, that should have a 64-bit timestamp,
451
+ * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
452
+ * nanoseconds.
453
+ */
447454 switch (po->tp_version) {
448455 case TPACKET_V1:
449456 h.h1->tp_sec = ts.tv_sec;
....@@ -469,10 +476,10 @@
469476 return ts_status;
470477 }
471478
472
-static void *packet_lookup_frame(struct packet_sock *po,
473
- struct packet_ring_buffer *rb,
474
- unsigned int position,
475
- int status)
479
+static void *packet_lookup_frame(const struct packet_sock *po,
480
+ const struct packet_ring_buffer *rb,
481
+ unsigned int position,
482
+ int status)
476483 {
477484 unsigned int pg_vec_pos, frame_offset;
478485 union tpacket_uhdr h;
....@@ -529,7 +536,7 @@
529536 int blk_size_in_bytes)
530537 {
531538 struct net_device *dev;
532
- unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
539
+ unsigned int mbits, div;
533540 struct ethtool_link_ksettings ecmd;
534541 int err;
535542
....@@ -541,31 +548,25 @@
541548 }
542549 err = __ethtool_get_link_ksettings(dev, &ecmd);
543550 rtnl_unlock();
544
- if (!err) {
545
- /*
546
- * If the link speed is so slow you don't really
547
- * need to worry about perf anyways
548
- */
549
- if (ecmd.base.speed < SPEED_1000 ||
550
- ecmd.base.speed == SPEED_UNKNOWN) {
551
- return DEFAULT_PRB_RETIRE_TOV;
552
- } else {
553
- msec = 1;
554
- div = ecmd.base.speed / 1000;
555
- }
556
- } else
551
+ if (err)
557552 return DEFAULT_PRB_RETIRE_TOV;
558553
554
+ /* If the link speed is so slow you don't really
555
+ * need to worry about perf anyways
556
+ */
557
+ if (ecmd.base.speed < SPEED_1000 ||
558
+ ecmd.base.speed == SPEED_UNKNOWN)
559
+ return DEFAULT_PRB_RETIRE_TOV;
560
+
561
+ div = ecmd.base.speed / 1000;
559562 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
560563
561564 if (div)
562565 mbits /= div;
563566
564
- tmo = mbits * msec;
565
-
566567 if (div)
567
- return tmo+1;
568
- return tmo;
568
+ return mbits + 1;
569
+ return mbits;
569570 }
570571
571572 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
....@@ -601,6 +602,7 @@
601602 req_u->req3.tp_block_size);
602603 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
603604 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
605
+ rwlock_init(&p1->blk_fill_in_prog_lock);
604606
605607 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
606608 prb_init_ft_ops(p1, req_u);
....@@ -667,10 +669,9 @@
667669 *
668670 */
669671 if (BLOCK_NUM_PKTS(pbd)) {
670
- while (atomic_read(&pkc->blk_fill_in_prog)) {
671
- /* Waiting for skb_copy_bits to finish... */
672
- cpu_chill();
673
- }
672
+ /* Waiting for skb_copy_bits to finish... */
673
+ write_lock(&pkc->blk_fill_in_prog_lock);
674
+ write_unlock(&pkc->blk_fill_in_prog_lock);
674675 }
675676
676677 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
....@@ -768,7 +769,7 @@
768769 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
769770 struct sock *sk = &po->sk;
770771
771
- if (po->stats.stats3.tp_drops)
772
+ if (atomic_read(&po->tp_drops))
772773 status |= TP_STATUS_LOSING;
773774
774775 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
....@@ -784,8 +785,8 @@
784785 * It shouldn't really happen as we don't close empty
785786 * blocks. See prb_retire_rx_blk_timer_expired().
786787 */
787
- struct timespec ts;
788
- getnstimeofday(&ts);
788
+ struct timespec64 ts;
789
+ ktime_get_real_ts64(&ts);
789790 h1->ts_last_pkt.ts_sec = ts.tv_sec;
790791 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
791792 }
....@@ -815,7 +816,7 @@
815816 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
816817 struct tpacket_block_desc *pbd1)
817818 {
818
- struct timespec ts;
819
+ struct timespec64 ts;
819820 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
820821
821822 smp_rmb();
....@@ -828,7 +829,7 @@
828829 BLOCK_NUM_PKTS(pbd1) = 0;
829830 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
830831
831
- getnstimeofday(&ts);
832
+ ktime_get_real_ts64(&ts);
832833
833834 h1->ts_first_pkt.ts_sec = ts.tv_sec;
834835 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
....@@ -929,10 +930,9 @@
929930 * the timer-handler already handled this case.
930931 */
931932 if (!(status & TP_STATUS_BLK_TMO)) {
932
- while (atomic_read(&pkc->blk_fill_in_prog)) {
933
- /* Waiting for skb_copy_bits to finish... */
934
- cpu_chill();
935
- }
933
+ /* Waiting for skb_copy_bits to finish... */
934
+ write_lock(&pkc->blk_fill_in_prog_lock);
935
+ write_unlock(&pkc->blk_fill_in_prog_lock);
936936 }
937937 prb_close_block(pkc, pbd, po, status);
938938 return;
....@@ -953,7 +953,8 @@
953953 __releases(&pkc->blk_fill_in_prog_lock)
954954 {
955955 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
956
- atomic_dec(&pkc->blk_fill_in_prog);
956
+
957
+ read_unlock(&pkc->blk_fill_in_prog_lock);
957958 }
958959
959960 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
....@@ -1008,14 +1009,13 @@
10081009 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
10091010 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
10101011 BLOCK_NUM_PKTS(pbd) += 1;
1011
- atomic_inc(&pkc->blk_fill_in_prog);
1012
+ read_lock(&pkc->blk_fill_in_prog_lock);
10121013 prb_run_all_ft_ops(pkc, ppd);
10131014 }
10141015
10151016 /* Assumes caller has the sk->rx_queue.lock */
10161017 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
10171018 struct sk_buff *skb,
1018
- int status,
10191019 unsigned int len
10201020 )
10211021 {
....@@ -1087,7 +1087,7 @@
10871087 po->rx_ring.head, status);
10881088 return curr;
10891089 case TPACKET_V3:
1090
- return __packet_lookup_frame_in_block(po, skb, status, len);
1090
+ return __packet_lookup_frame_in_block(po, skb, len);
10911091 default:
10921092 WARN(1, "TPACKET version not supported\n");
10931093 BUG();
....@@ -1095,10 +1095,10 @@
10951095 }
10961096 }
10971097
1098
-static void *prb_lookup_block(struct packet_sock *po,
1099
- struct packet_ring_buffer *rb,
1100
- unsigned int idx,
1101
- int status)
1098
+static void *prb_lookup_block(const struct packet_sock *po,
1099
+ const struct packet_ring_buffer *rb,
1100
+ unsigned int idx,
1101
+ int status)
11021102 {
11031103 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
11041104 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
....@@ -1211,12 +1211,12 @@
12111211 #define ROOM_LOW 0x1
12121212 #define ROOM_NORMAL 0x2
12131213
1214
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1214
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
12151215 {
12161216 int idx, len;
12171217
1218
- len = po->rx_ring.frame_max + 1;
1219
- idx = po->rx_ring.head;
1218
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
1219
+ idx = READ_ONCE(po->rx_ring.head);
12201220 if (pow_off)
12211221 idx += len >> pow_off;
12221222 if (idx >= len)
....@@ -1224,12 +1224,12 @@
12241224 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
12251225 }
12261226
1227
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1227
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
12281228 {
12291229 int idx, len;
12301230
1231
- len = po->rx_ring.prb_bdqc.knum_blocks;
1232
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1231
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1232
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
12331233 if (pow_off)
12341234 idx += len >> pow_off;
12351235 if (idx >= len)
....@@ -1237,15 +1237,18 @@
12371237 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
12381238 }
12391239
1240
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1240
+static int __packet_rcv_has_room(const struct packet_sock *po,
1241
+ const struct sk_buff *skb)
12411242 {
1242
- struct sock *sk = &po->sk;
1243
+ const struct sock *sk = &po->sk;
12431244 int ret = ROOM_NONE;
12441245
12451246 if (po->prot_hook.func != tpacket_rcv) {
1246
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1247
- - (skb ? skb->truesize : 0);
1248
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1247
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1248
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1249
+ - (skb ? skb->truesize : 0);
1250
+
1251
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
12491252 return ROOM_NORMAL;
12501253 else if (avail > 0)
12511254 return ROOM_LOW;
....@@ -1270,17 +1273,22 @@
12701273
12711274 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
12721275 {
1273
- int ret;
1274
- bool has_room;
1276
+ int pressure, ret;
12751277
1276
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
12771278 ret = __packet_rcv_has_room(po, skb);
1278
- has_room = ret == ROOM_NORMAL;
1279
- if (po->pressure == has_room)
1280
- po->pressure = !has_room;
1281
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1279
+ pressure = ret != ROOM_NORMAL;
1280
+
1281
+ if (READ_ONCE(po->pressure) != pressure)
1282
+ WRITE_ONCE(po->pressure, pressure);
12821283
12831284 return ret;
1285
+}
1286
+
1287
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1288
+{
1289
+ if (READ_ONCE(po->pressure) &&
1290
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1291
+ WRITE_ONCE(po->pressure, 0);
12841292 }
12851293
12861294 static void packet_sock_destruct(struct sock *sk)
....@@ -1356,7 +1364,7 @@
13561364 struct packet_sock *po, *po_next, *po_skip = NULL;
13571365 unsigned int i, j, room = ROOM_NONE;
13581366
1359
- po = pkt_sk(f->arr[idx]);
1367
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
13601368
13611369 if (try_self) {
13621370 room = packet_rcv_has_room(po, skb);
....@@ -1368,8 +1376,8 @@
13681376
13691377 i = j = min_t(int, po->rollover->sock, num - 1);
13701378 do {
1371
- po_next = pkt_sk(f->arr[i]);
1372
- if (po_next != po_skip && !po_next->pressure &&
1379
+ po_next = pkt_sk(rcu_dereference(f->arr[i]));
1380
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
13731381 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
13741382 if (i != j)
13751383 po->rollover->sock = i;
....@@ -1463,7 +1471,7 @@
14631471 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
14641472 idx = fanout_demux_rollover(f, skb, idx, true, num);
14651473
1466
- po = pkt_sk(f->arr[idx]);
1474
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
14671475 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
14681476 }
14691477
....@@ -1477,7 +1485,7 @@
14771485 struct packet_fanout *f = po->fanout;
14781486
14791487 spin_lock(&f->lock);
1480
- f->arr[f->num_members] = sk;
1488
+ rcu_assign_pointer(f->arr[f->num_members], sk);
14811489 smp_wmb();
14821490 f->num_members++;
14831491 if (f->num_members == 1)
....@@ -1492,11 +1500,14 @@
14921500
14931501 spin_lock(&f->lock);
14941502 for (i = 0; i < f->num_members; i++) {
1495
- if (f->arr[i] == sk)
1503
+ if (rcu_dereference_protected(f->arr[i],
1504
+ lockdep_is_held(&f->lock)) == sk)
14961505 break;
14971506 }
14981507 BUG_ON(i >= f->num_members);
1499
- f->arr[i] = f->arr[f->num_members - 1];
1508
+ rcu_assign_pointer(f->arr[i],
1509
+ rcu_dereference_protected(f->arr[f->num_members - 1],
1510
+ lockdep_is_held(&f->lock)));
15001511 f->num_members--;
15011512 if (f->num_members == 0)
15021513 __dev_remove_pack(&f->prot_hook);
....@@ -1539,7 +1550,7 @@
15391550 }
15401551 }
15411552
1542
-static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1553
+static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
15431554 unsigned int len)
15441555 {
15451556 struct bpf_prog *new;
....@@ -1548,10 +1559,10 @@
15481559
15491560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
15501561 return -EPERM;
1551
- if (len != sizeof(fprog))
1552
- return -EINVAL;
1553
- if (copy_from_user(&fprog, data, len))
1554
- return -EFAULT;
1562
+
1563
+ ret = copy_bpf_fprog_from_user(&fprog, data, len);
1564
+ if (ret)
1565
+ return ret;
15551566
15561567 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
15571568 if (ret)
....@@ -1561,7 +1572,7 @@
15611572 return 0;
15621573 }
15631574
1564
-static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1575
+static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
15651576 unsigned int len)
15661577 {
15671578 struct bpf_prog *new;
....@@ -1571,7 +1582,7 @@
15711582 return -EPERM;
15721583 if (len != sizeof(fd))
15731584 return -EINVAL;
1574
- if (copy_from_user(&fd, data, len))
1585
+ if (copy_from_sockptr(&fd, data, len))
15751586 return -EFAULT;
15761587
15771588 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
....@@ -1582,7 +1593,7 @@
15821593 return 0;
15831594 }
15841595
1585
-static int fanout_set_data(struct packet_sock *po, char __user *data,
1596
+static int fanout_set_data(struct packet_sock *po, sockptr_t data,
15861597 unsigned int len)
15871598 {
15881599 switch (po->fanout->type) {
....@@ -1634,13 +1645,15 @@
16341645 return false;
16351646 }
16361647
1637
-static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1648
+static int fanout_add(struct sock *sk, struct fanout_args *args)
16381649 {
16391650 struct packet_rollover *rollover = NULL;
16401651 struct packet_sock *po = pkt_sk(sk);
1652
+ u16 type_flags = args->type_flags;
16411653 struct packet_fanout *f, *match;
16421654 u8 type = type_flags & 0xff;
16431655 u8 flags = type_flags >> 8;
1656
+ u16 id = args->id;
16441657 int err;
16451658
16461659 switch (type) {
....@@ -1698,11 +1711,21 @@
16981711 }
16991712 }
17001713 err = -EINVAL;
1701
- if (match && match->flags != flags)
1702
- goto out;
1703
- if (!match) {
1714
+ if (match) {
1715
+ if (match->flags != flags)
1716
+ goto out;
1717
+ if (args->max_num_members &&
1718
+ args->max_num_members != match->max_num_members)
1719
+ goto out;
1720
+ } else {
1721
+ if (args->max_num_members > PACKET_FANOUT_MAX)
1722
+ goto out;
1723
+ if (!args->max_num_members)
1724
+ /* legacy PACKET_FANOUT_MAX */
1725
+ args->max_num_members = 256;
17041726 err = -ENOMEM;
1705
- match = kzalloc(sizeof(*match), GFP_KERNEL);
1727
+ match = kvzalloc(struct_size(match, arr, args->max_num_members),
1728
+ GFP_KERNEL);
17061729 if (!match)
17071730 goto out;
17081731 write_pnet(&match->net, sock_net(sk));
....@@ -1719,6 +1742,7 @@
17191742 match->prot_hook.af_packet_priv = match;
17201743 match->prot_hook.af_packet_net = read_pnet(&match->net);
17211744 match->prot_hook.id_match = match_fanout_group;
1745
+ match->max_num_members = args->max_num_members;
17221746 list_add(&match->list, &fanout_list);
17231747 }
17241748 err = -EINVAL;
....@@ -1729,7 +1753,7 @@
17291753 match->prot_hook.type == po->prot_hook.type &&
17301754 match->prot_hook.dev == po->prot_hook.dev) {
17311755 err = -ENOSPC;
1732
- if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1756
+ if (refcount_read(&match->sk_ref) < match->max_num_members) {
17331757 __dev_remove_pack(&po->prot_hook);
17341758
17351759 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
....@@ -1746,7 +1770,7 @@
17461770
17471771 if (err && !refcount_read(&match->sk_ref)) {
17481772 list_del(&match->list);
1749
- kfree(match);
1773
+ kvfree(match);
17501774 }
17511775
17521776 out:
....@@ -1836,7 +1860,7 @@
18361860 skb_dst_drop(skb);
18371861
18381862 /* drop conntrack reference */
1839
- nf_reset(skb);
1863
+ nf_reset_ct(skb);
18401864
18411865 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
18421866
....@@ -1864,6 +1888,24 @@
18641888 return 0;
18651889 }
18661890
1891
+static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1892
+{
1893
+ int depth;
1894
+
1895
+ if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1896
+ sock->type == SOCK_RAW) {
1897
+ skb_reset_mac_header(skb);
1898
+ skb->protocol = dev_parse_header_protocol(skb);
1899
+ }
1900
+
1901
+ /* Move network header to the right position for VLAN tagged packets */
1902
+ if (likely(skb->dev->type == ARPHRD_ETHER) &&
1903
+ eth_type_vlan(skb->protocol) &&
1904
+ vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1905
+ skb_set_network_header(skb, depth);
1906
+
1907
+ skb_probe_transport_header(skb);
1908
+}
18671909
18681910 /*
18691911 * Output a raw packet to a device layer. This bypasses all the other
....@@ -1956,7 +1998,7 @@
19561998 goto retry;
19571999 }
19582000
1959
- if (!dev_validate_header(dev, skb->data, len)) {
2001
+ if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
19602002 err = -EINVAL;
19612003 goto out_unlock;
19622004 }
....@@ -1979,12 +2021,12 @@
19792021 skb->mark = sk->sk_mark;
19802022 skb->tstamp = sockc.transmit_time;
19812023
1982
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2024
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
19832025
19842026 if (unlikely(extra_len == 4))
19852027 skb->no_fcs = 1;
19862028
1987
- skb_probe_transport_header(skb, 0);
2029
+ packet_parse_headers(skb, sock);
19882030
19892031 dev_queue_xmit(skb);
19902032 rcu_read_unlock();
....@@ -2061,7 +2103,7 @@
20612103
20622104 skb->dev = dev;
20632105
2064
- if (dev->header_ops) {
2106
+ if (dev_has_header(dev)) {
20652107 /* The device has an explicit notion of ll header,
20662108 * exported to higher levels.
20672109 *
....@@ -2106,7 +2148,7 @@
21062148 sll = &PACKET_SKB_CB(skb)->sa.ll;
21072149 sll->sll_hatype = dev->type;
21082150 sll->sll_pkttype = skb->pkt_type;
2109
- if (unlikely(po->origdev))
2151
+ if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
21102152 sll->sll_ifindex = orig_dev->ifindex;
21112153 else
21122154 sll->sll_ifindex = dev->ifindex;
....@@ -2126,7 +2168,7 @@
21262168 skb_dst_drop(skb);
21272169
21282170 /* drop conntrack reference */
2129
- nf_reset(skb);
2171
+ nf_reset_ct(skb);
21302172
21312173 spin_lock(&sk->sk_receive_queue.lock);
21322174 po->stats.stats1.tp_packets++;
....@@ -2138,10 +2180,8 @@
21382180
21392181 drop_n_acct:
21402182 is_drop_n_account = true;
2141
- spin_lock(&sk->sk_receive_queue.lock);
2142
- po->stats.stats1.tp_drops++;
2183
+ atomic_inc(&po->tp_drops);
21432184 atomic_inc(&sk->sk_drops);
2144
- spin_unlock(&sk->sk_receive_queue.lock);
21452185
21462186 drop_n_restore:
21472187 if (skb_head != skb->data && skb_shared(skb)) {
....@@ -2170,7 +2210,7 @@
21702210 unsigned short macoff, hdrlen;
21712211 unsigned int netoff;
21722212 struct sk_buff *copy_skb = NULL;
2173
- struct timespec ts;
2213
+ struct timespec64 ts;
21742214 __u32 ts_status;
21752215 bool is_drop_n_account = false;
21762216 unsigned int slot_id = 0;
....@@ -2192,7 +2232,7 @@
21922232 if (!net_eq(dev_net(dev), sock_net(sk)))
21932233 goto drop;
21942234
2195
- if (dev->header_ops) {
2235
+ if (dev_has_header(dev)) {
21962236 if (sk->sk_type != SOCK_DGRAM)
21972237 skb_push(skb, skb->data - skb_mac_header(skb));
21982238 else if (skb->pkt_type == PACKET_OUTGOING) {
....@@ -2207,11 +2247,16 @@
22072247 if (!res)
22082248 goto drop_n_restore;
22092249
2250
+ /* If we are flooded, just give up */
2251
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2252
+ atomic_inc(&po->tp_drops);
2253
+ goto drop_n_restore;
2254
+ }
2255
+
22102256 if (skb->ip_summed == CHECKSUM_PARTIAL)
22112257 status |= TP_STATUS_CSUMNOTREADY;
22122258 else if (skb->pkt_type != PACKET_OUTGOING &&
2213
- (skb->ip_summed == CHECKSUM_COMPLETE ||
2214
- skb_csum_unnecessary(skb)))
2259
+ skb_csum_unnecessary(skb))
22152260 status |= TP_STATUS_CSUM_VALID;
22162261
22172262 if (snaplen > res)
....@@ -2232,9 +2277,7 @@
22322277 macoff = netoff - maclen;
22332278 }
22342279 if (netoff > USHRT_MAX) {
2235
- spin_lock(&sk->sk_receive_queue.lock);
2236
- po->stats.stats1.tp_drops++;
2237
- spin_unlock(&sk->sk_receive_queue.lock);
2280
+ atomic_inc(&po->tp_drops);
22382281 goto drop_n_restore;
22392282 }
22402283 if (po->tp_version <= TPACKET_V2) {
....@@ -2247,8 +2290,11 @@
22472290 copy_skb = skb_get(skb);
22482291 skb_head = skb->data;
22492292 }
2250
- if (copy_skb)
2293
+ if (copy_skb) {
2294
+ memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2295
+ sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
22512296 skb_set_owner_r(copy_skb, sk);
2297
+ }
22522298 }
22532299 snaplen = po->rx_ring.frame_size - macoff;
22542300 if ((int)snaplen < 0) {
....@@ -2300,7 +2346,7 @@
23002346 * Anyways, moving it for V1/V2 only as V3 doesn't need this
23012347 * at packet level.
23022348 */
2303
- if (po->stats.stats1.tp_drops)
2349
+ if (atomic_read(&po->tp_drops))
23042350 status |= TP_STATUS_LOSING;
23052351 }
23062352
....@@ -2313,8 +2359,13 @@
23132359
23142360 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
23152361
2316
- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2317
- getnstimeofday(&ts);
2362
+ /* Always timestamp; prefer an existing software timestamp taken
2363
+ * closer to the time of capture.
2364
+ */
2365
+ ts_status = tpacket_get_timestamp(skb, &ts,
2366
+ po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
2367
+ if (!ts_status)
2368
+ ktime_get_real_ts64(&ts);
23182369
23192370 status |= ts_status;
23202371
....@@ -2370,7 +2421,7 @@
23702421 sll->sll_hatype = dev->type;
23712422 sll->sll_protocol = skb->protocol;
23722423 sll->sll_pkttype = skb->pkt_type;
2373
- if (unlikely(po->origdev))
2424
+ if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
23742425 sll->sll_ifindex = orig_dev->ifindex;
23752426 else
23762427 sll->sll_ifindex = dev->ifindex;
....@@ -2413,9 +2464,9 @@
24132464 return 0;
24142465
24152466 drop_n_account:
2416
- is_drop_n_account = true;
2417
- po->stats.stats1.tp_drops++;
24182467 spin_unlock(&sk->sk_receive_queue.lock);
2468
+ atomic_inc(&po->tp_drops);
2469
+ is_drop_n_account = true;
24192470
24202471 sk->sk_data_ready(sk);
24212472 kfree_skb(copy_skb);
....@@ -2441,15 +2492,6 @@
24412492 }
24422493
24432494 sock_wfree(skb);
2444
-}
2445
-
2446
-static void tpacket_set_protocol(const struct net_device *dev,
2447
- struct sk_buff *skb)
2448
-{
2449
- if (dev->type == ARPHRD_ETHER) {
2450
- skb_reset_mac_header(skb);
2451
- skb->protocol = eth_hdr(skb)->h_proto;
2452
- }
24532495 }
24542496
24552497 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
....@@ -2499,7 +2541,7 @@
24992541 skb->priority = po->sk.sk_priority;
25002542 skb->mark = po->sk.sk_mark;
25012543 skb->tstamp = sockc->transmit_time;
2502
- sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2544
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
25032545 skb_zcopy_set_nouarg(skb, ph.raw);
25042546
25052547 skb_reserve(skb, hlen);
....@@ -2522,8 +2564,6 @@
25222564 return err;
25232565 if (!dev_validate_header(dev, skb->data, hdrlen))
25242566 return -EINVAL;
2525
- if (!skb->protocol)
2526
- tpacket_set_protocol(dev, skb);
25272567
25282568 data += hdrlen;
25292569 to_write -= hdrlen;
....@@ -2558,7 +2598,7 @@
25582598 len = ((to_write > len_max) ? len_max : to_write);
25592599 }
25602600
2561
- skb_probe_transport_header(skb, 0);
2601
+ packet_parse_headers(skb, sock);
25622602
25632603 return tp_len;
25642604 }
....@@ -2788,9 +2828,11 @@
27882828 packet_inc_pending(&po->tx_ring);
27892829
27902830 status = TP_STATUS_SEND_REQUEST;
2791
- err = po->xmit(skb);
2792
- if (unlikely(err > 0)) {
2793
- err = net_xmit_errno(err);
2831
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
2832
+ err = READ_ONCE(po->xmit)(skb);
2833
+ if (unlikely(err != 0)) {
2834
+ if (err > 0)
2835
+ err = net_xmit_errno(err);
27942836 if (err && __packet_get_status(po, ph) ==
27952837 TP_STATUS_AVAILABLE) {
27962838 /* skb was destructed already */
....@@ -2957,13 +2999,13 @@
29572999 if (err)
29583000 goto out_free;
29593001
2960
- if (sock->type == SOCK_RAW &&
2961
- !dev_validate_header(dev, skb->data, len)) {
3002
+ if ((sock->type == SOCK_RAW &&
3003
+ !dev_validate_header(dev, skb->data, len)) || !skb->len) {
29623004 err = -EINVAL;
29633005 goto out_free;
29643006 }
29653007
2966
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
3008
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
29673009
29683010 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
29693011 !packet_extra_vlan_len_allowed(dev, skb)) {
....@@ -2977,6 +3019,11 @@
29773019 skb->mark = sockc.mark;
29783020 skb->tstamp = sockc.transmit_time;
29793021
3022
+ if (unlikely(extra_len == 4))
3023
+ skb->no_fcs = 1;
3024
+
3025
+ packet_parse_headers(skb, sock);
3026
+
29803027 if (has_vnet_hdr) {
29813028 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
29823029 if (err)
....@@ -2985,14 +3032,14 @@
29853032 virtio_net_hdr_set_proto(skb, &vnet_hdr);
29863033 }
29873034
2988
- skb_probe_transport_header(skb, reserve);
2989
-
2990
- if (unlikely(extra_len == 4))
2991
- skb->no_fcs = 1;
2992
-
2993
- err = po->xmit(skb);
2994
- if (err > 0 && (err = net_xmit_errno(err)) != 0)
2995
- goto out_unlock;
3035
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
3036
+ err = READ_ONCE(po->xmit)(skb);
3037
+ if (unlikely(err != 0)) {
3038
+ if (err > 0)
3039
+ err = net_xmit_errno(err);
3040
+ if (err)
3041
+ goto out_unlock;
3042
+ }
29963043
29973044 dev_put(dev);
29983045
....@@ -3012,10 +3059,13 @@
30123059 struct sock *sk = sock->sk;
30133060 struct packet_sock *po = pkt_sk(sk);
30143061
3015
- if (po->tx_ring.pg_vec)
3062
+ /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
3063
+ * tpacket_snd() will redo the check safely.
3064
+ */
3065
+ if (data_race(po->tx_ring.pg_vec))
30163066 return tpacket_snd(po, msg);
3017
- else
3018
- return packet_snd(sock, msg, len);
3067
+
3068
+ return packet_snd(sock, msg, len);
30193069 }
30203070
30213071 /*
....@@ -3076,7 +3126,7 @@
30763126 kfree(po->rollover);
30773127 if (f) {
30783128 fanout_release_data(f);
3079
- kfree(f);
3129
+ kvfree(f);
30803130 }
30813131 /*
30823132 * Now the socket is dead. No more input will appear.
....@@ -3111,6 +3161,9 @@
31113161
31123162 lock_sock(sk);
31133163 spin_lock(&po->bind_lock);
3164
+ if (!proto)
3165
+ proto = po->num;
3166
+
31143167 rcu_read_lock();
31153168
31163169 if (po->fanout) {
....@@ -3213,7 +3266,7 @@
32133266 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
32143267 name[sizeof(uaddr->sa_data)] = 0;
32153268
3216
- return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3269
+ return packet_do_bind(sk, name, 0, 0);
32173270 }
32183271
32193272 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
....@@ -3230,8 +3283,7 @@
32303283 if (sll->sll_family != AF_PACKET)
32313284 return -EINVAL;
32323285
3233
- return packet_do_bind(sk, NULL, sll->sll_ifindex,
3234
- sll->sll_protocol ? : pkt_sk(sk)->num);
3286
+ return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
32353287 }
32363288
32373289 static struct proto packet_proto = {
....@@ -3371,8 +3423,7 @@
33713423 if (skb == NULL)
33723424 goto out;
33733425
3374
- if (pkt_sk(sk)->pressure)
3375
- packet_rcv_has_room(pkt_sk(sk), NULL);
3426
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
33763427
33773428 if (pkt_sk(sk)->has_vnet_hdr) {
33783429 err = packet_rcv_vnet(msg, skb, &len);
....@@ -3407,6 +3458,8 @@
34073458 sock_recv_ts_and_drops(msg, sk, skb);
34083459
34093460 if (msg->msg_name) {
3461
+ const size_t max_len = min(sizeof(skb->cb),
3462
+ sizeof(struct sockaddr_storage));
34103463 int copy_len;
34113464
34123465 /* If the address length field is there to be filled
....@@ -3429,18 +3482,21 @@
34293482 msg->msg_namelen = sizeof(struct sockaddr_ll);
34303483 }
34313484 }
3485
+ if (WARN_ON_ONCE(copy_len > max_len)) {
3486
+ copy_len = max_len;
3487
+ msg->msg_namelen = copy_len;
3488
+ }
34323489 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
34333490 }
34343491
3435
- if (pkt_sk(sk)->auxdata) {
3492
+ if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
34363493 struct tpacket_auxdata aux;
34373494
34383495 aux.tp_status = TP_STATUS_USER;
34393496 if (skb->ip_summed == CHECKSUM_PARTIAL)
34403497 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
34413498 else if (skb->pkt_type != PACKET_OUTGOING &&
3442
- (skb->ip_summed == CHECKSUM_COMPLETE ||
3443
- skb_csum_unnecessary(skb)))
3499
+ skb_csum_unnecessary(skb))
34443500 aux.tp_status |= TP_STATUS_CSUM_VALID;
34453501
34463502 aux.tp_len = origlen;
....@@ -3670,7 +3726,8 @@
36703726 }
36713727
36723728 static int
3673
-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3729
+packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3730
+ unsigned int optlen)
36743731 {
36753732 struct sock *sk = sock->sk;
36763733 struct packet_sock *po = pkt_sk(sk);
....@@ -3690,7 +3747,7 @@
36903747 return -EINVAL;
36913748 if (len > sizeof(mreq))
36923749 len = sizeof(mreq);
3693
- if (copy_from_user(&mreq, optval, len))
3750
+ if (copy_from_sockptr(&mreq, optval, len))
36943751 return -EFAULT;
36953752 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
36963753 return -EINVAL;
....@@ -3721,7 +3778,7 @@
37213778 if (optlen < len) {
37223779 ret = -EINVAL;
37233780 } else {
3724
- if (copy_from_user(&req_u.req, optval, len))
3781
+ if (copy_from_sockptr(&req_u.req, optval, len))
37253782 ret = -EFAULT;
37263783 else
37273784 ret = packet_set_ring(sk, &req_u, 0,
....@@ -3736,7 +3793,7 @@
37363793
37373794 if (optlen != sizeof(val))
37383795 return -EINVAL;
3739
- if (copy_from_user(&val, optval, sizeof(val)))
3796
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37403797 return -EFAULT;
37413798
37423799 pkt_sk(sk)->copy_thresh = val;
....@@ -3748,7 +3805,7 @@
37483805
37493806 if (optlen != sizeof(val))
37503807 return -EINVAL;
3751
- if (copy_from_user(&val, optval, sizeof(val)))
3808
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37523809 return -EFAULT;
37533810 switch (val) {
37543811 case TPACKET_V1:
....@@ -3774,7 +3831,7 @@
37743831
37753832 if (optlen != sizeof(val))
37763833 return -EINVAL;
3777
- if (copy_from_user(&val, optval, sizeof(val)))
3834
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37783835 return -EFAULT;
37793836 if (val > INT_MAX)
37803837 return -EINVAL;
....@@ -3794,7 +3851,7 @@
37943851
37953852 if (optlen != sizeof(val))
37963853 return -EINVAL;
3797
- if (copy_from_user(&val, optval, sizeof(val)))
3854
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37983855 return -EFAULT;
37993856
38003857 lock_sock(sk);
....@@ -3813,12 +3870,10 @@
38133870
38143871 if (optlen < sizeof(val))
38153872 return -EINVAL;
3816
- if (copy_from_user(&val, optval, sizeof(val)))
3873
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38173874 return -EFAULT;
38183875
3819
- lock_sock(sk);
3820
- po->auxdata = !!val;
3821
- release_sock(sk);
3876
+ packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
38223877 return 0;
38233878 }
38243879 case PACKET_ORIGDEV:
....@@ -3827,12 +3882,10 @@
38273882
38283883 if (optlen < sizeof(val))
38293884 return -EINVAL;
3830
- if (copy_from_user(&val, optval, sizeof(val)))
3885
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38313886 return -EFAULT;
38323887
3833
- lock_sock(sk);
3834
- po->origdev = !!val;
3835
- release_sock(sk);
3888
+ packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
38363889 return 0;
38373890 }
38383891 case PACKET_VNET_HDR:
....@@ -3843,7 +3896,7 @@
38433896 return -EINVAL;
38443897 if (optlen < sizeof(val))
38453898 return -EINVAL;
3846
- if (copy_from_user(&val, optval, sizeof(val)))
3899
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38473900 return -EFAULT;
38483901
38493902 lock_sock(sk);
....@@ -3862,7 +3915,7 @@
38623915
38633916 if (optlen != sizeof(val))
38643917 return -EINVAL;
3865
- if (copy_from_user(&val, optval, sizeof(val)))
3918
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38663919 return -EFAULT;
38673920
38683921 po->tp_tstamp = val;
....@@ -3870,14 +3923,14 @@
38703923 }
38713924 case PACKET_FANOUT:
38723925 {
3873
- int val;
3926
+ struct fanout_args args = { 0 };
38743927
3875
- if (optlen != sizeof(val))
3928
+ if (optlen != sizeof(int) && optlen != sizeof(args))
38763929 return -EINVAL;
3877
- if (copy_from_user(&val, optval, sizeof(val)))
3930
+ if (copy_from_sockptr(&args, optval, optlen))
38783931 return -EFAULT;
38793932
3880
- return fanout_add(sk, val & 0xffff, val >> 16);
3933
+ return fanout_add(sk, &args);
38813934 }
38823935 case PACKET_FANOUT_DATA:
38833936 {
....@@ -3887,13 +3940,27 @@
38873940
38883941 return fanout_set_data(po, optval, optlen);
38893942 }
3943
+ case PACKET_IGNORE_OUTGOING:
3944
+ {
3945
+ int val;
3946
+
3947
+ if (optlen != sizeof(val))
3948
+ return -EINVAL;
3949
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
3950
+ return -EFAULT;
3951
+ if (val < 0 || val > 1)
3952
+ return -EINVAL;
3953
+
3954
+ po->prot_hook.ignore_outgoing = !!val;
3955
+ return 0;
3956
+ }
38903957 case PACKET_TX_HAS_OFF:
38913958 {
38923959 unsigned int val;
38933960
38943961 if (optlen != sizeof(val))
38953962 return -EINVAL;
3896
- if (copy_from_user(&val, optval, sizeof(val)))
3963
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38973964 return -EFAULT;
38983965
38993966 lock_sock(sk);
....@@ -3912,10 +3979,11 @@
39123979
39133980 if (optlen != sizeof(val))
39143981 return -EINVAL;
3915
- if (copy_from_user(&val, optval, sizeof(val)))
3982
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
39163983 return -EFAULT;
39173984
3918
- po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3985
+ /* Paired with all lockless reads of po->xmit */
3986
+ WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
39193987 return 0;
39203988 }
39213989 default:
....@@ -3933,6 +4001,7 @@
39334001 void *data = &val;
39344002 union tpacket_stats_u st;
39354003 struct tpacket_rollover_stats rstats;
4004
+ int drops;
39364005
39374006 if (level != SOL_PACKET)
39384007 return -ENOPROTOOPT;
....@@ -3949,23 +4018,26 @@
39494018 memcpy(&st, &po->stats, sizeof(st));
39504019 memset(&po->stats, 0, sizeof(po->stats));
39514020 spin_unlock_bh(&sk->sk_receive_queue.lock);
4021
+ drops = atomic_xchg(&po->tp_drops, 0);
39524022
39534023 if (po->tp_version == TPACKET_V3) {
39544024 lv = sizeof(struct tpacket_stats_v3);
3955
- st.stats3.tp_packets += st.stats3.tp_drops;
4025
+ st.stats3.tp_drops = drops;
4026
+ st.stats3.tp_packets += drops;
39564027 data = &st.stats3;
39574028 } else {
39584029 lv = sizeof(struct tpacket_stats);
3959
- st.stats1.tp_packets += st.stats1.tp_drops;
4030
+ st.stats1.tp_drops = drops;
4031
+ st.stats1.tp_packets += drops;
39604032 data = &st.stats1;
39614033 }
39624034
39634035 break;
39644036 case PACKET_AUXDATA:
3965
- val = po->auxdata;
4037
+ val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
39664038 break;
39674039 case PACKET_ORIGDEV:
3968
- val = po->origdev;
4040
+ val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
39694041 break;
39704042 case PACKET_VNET_HDR:
39714043 val = po->has_vnet_hdr;
....@@ -4010,6 +4082,9 @@
40104082 ((u32)po->fanout->flags << 24)) :
40114083 0);
40124084 break;
4085
+ case PACKET_IGNORE_OUTGOING:
4086
+ val = po->prot_hook.ignore_outgoing;
4087
+ break;
40134088 case PACKET_ROLLOVER_STATS:
40144089 if (!po->rollover)
40154090 return -EINVAL;
....@@ -4038,28 +4113,6 @@
40384113 return 0;
40394114 }
40404115
4041
-
4042
-#ifdef CONFIG_COMPAT
4043
-static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4044
- char __user *optval, unsigned int optlen)
4045
-{
4046
- struct packet_sock *po = pkt_sk(sock->sk);
4047
-
4048
- if (level != SOL_PACKET)
4049
- return -ENOPROTOOPT;
4050
-
4051
- if (optname == PACKET_FANOUT_DATA &&
4052
- po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4053
- optval = (char __user *)get_compat_bpf_fprog(optval);
4054
- if (!optval)
4055
- return -EFAULT;
4056
- optlen = sizeof(struct sock_fprog);
4057
- }
4058
-
4059
- return packet_setsockopt(sock, level, optname, optval, optlen);
4060
-}
4061
-#endif
4062
-
40634116 static int packet_notifier(struct notifier_block *this,
40644117 unsigned long msg, void *ptr)
40654118 {
....@@ -4075,7 +4128,7 @@
40754128 case NETDEV_UNREGISTER:
40764129 if (po->mclist)
40774130 packet_dev_mclist_delete(dev, &po->mclist);
4078
- /* fallthrough */
4131
+ fallthrough;
40794132
40804133 case NETDEV_DOWN:
40814134 if (dev->ifindex == po->ifindex) {
....@@ -4135,11 +4188,6 @@
41354188 spin_unlock_bh(&sk->sk_receive_queue.lock);
41364189 return put_user(amount, (int __user *)arg);
41374190 }
4138
- case SIOCGSTAMP:
4139
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
4140
- case SIOCGSTAMPNS:
4141
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
4142
-
41434191 #ifdef CONFIG_INET
41444192 case SIOCADDRT:
41454193 case SIOCDELRT:
....@@ -4177,8 +4225,7 @@
41774225 TP_STATUS_KERNEL))
41784226 mask |= EPOLLIN | EPOLLRDNORM;
41794227 }
4180
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4181
- po->pressure = 0;
4228
+ packet_rcv_try_clear_pressure(po);
41824229 spin_unlock_bh(&sk->sk_receive_queue.lock);
41834230 spin_lock_bh(&sk->sk_write_queue.lock);
41844231 if (po->tx_ring.pg_vec) {
....@@ -4297,7 +4344,7 @@
42974344 struct packet_ring_buffer *rb;
42984345 struct sk_buff_head *rb_queue;
42994346 __be16 num;
4300
- int err = -EINVAL;
4347
+ int err;
43014348 /* Added to avoid minimal code churn */
43024349 struct tpacket_req *req = &req_u->req;
43034350
....@@ -4527,10 +4574,9 @@
45274574 .getname = packet_getname_spkt,
45284575 .poll = datagram_poll,
45294576 .ioctl = packet_ioctl,
4577
+ .gettstamp = sock_gettstamp,
45304578 .listen = sock_no_listen,
45314579 .shutdown = sock_no_shutdown,
4532
- .setsockopt = sock_no_setsockopt,
4533
- .getsockopt = sock_no_getsockopt,
45344580 .sendmsg = packet_sendmsg_spkt,
45354581 .recvmsg = packet_recvmsg,
45364582 .mmap = sock_no_mmap,
....@@ -4548,13 +4594,11 @@
45484594 .getname = packet_getname,
45494595 .poll = packet_poll,
45504596 .ioctl = packet_ioctl,
4597
+ .gettstamp = sock_gettstamp,
45514598 .listen = sock_no_listen,
45524599 .shutdown = sock_no_shutdown,
45534600 .setsockopt = packet_setsockopt,
45544601 .getsockopt = packet_getsockopt,
4555
-#ifdef CONFIG_COMPAT
4556
- .compat_setsockopt = compat_packet_setsockopt,
4557
-#endif
45584602 .sendmsg = packet_sendmsg,
45594603 .recvmsg = packet_recvmsg,
45604604 .mmap = packet_mmap,
....@@ -4631,9 +4675,11 @@
46314675 mutex_init(&net->packet.sklist_lock);
46324676 INIT_HLIST_HEAD(&net->packet.sklist);
46334677
4678
+#ifdef CONFIG_PROC_FS
46344679 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
46354680 sizeof(struct seq_net_private)))
46364681 return -ENOMEM;
4682
+#endif /* CONFIG_PROC_FS */
46374683
46384684 return 0;
46394685 }