hc
2024-09-20 a36159eec6ca17402b0e146b86efaf76568dc353
kernel/net/packet/af_packet.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -43,13 +44,6 @@
4344 * Chetan Loke : Implemented TPACKET_V3 block abstraction
4445 * layer.
4546 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46
- *
47
- *
48
- * This program is free software; you can redistribute it and/or
49
- * modify it under the terms of the GNU General Public License
50
- * as published by the Free Software Foundation; either version
51
- * 2 of the License, or (at your option) any later version.
52
- *
5347 */
5448
5549 #include <linux/types.h>
....@@ -99,52 +93,56 @@
9993
10094 /*
10195 Assumptions:
102
- - if device has no dev->hard_header routine, it adds and removes ll header
103
- inside itself. In this case ll header is invisible outside of device,
104
- but higher levels still should reserve dev->hard_header_len.
105
- Some devices are enough clever to reallocate skb, when header
106
- will not fit to reserved space (tunnel), another ones are silly
107
- (PPP).
96
+ - If the device has no dev->header_ops->create, there is no LL header
97
+ visible above the device. In this case, its hard_header_len should be 0.
98
+ The device may prepend its own header internally. In this case, its
99
+ needed_headroom should be set to the space needed for it to add its
100
+ internal header.
101
+ For example, a WiFi driver pretending to be an Ethernet driver should
102
+ set its hard_header_len to be the Ethernet header length, and set its
103
+ needed_headroom to be (the real WiFi header length - the fake Ethernet
104
+ header length).
108105 - packet socket receives packets with pulled ll header,
109106 so that SOCK_RAW should push it back.
110107
111108 On receive:
112109 -----------
113110
114
-Incoming, dev->hard_header!=NULL
111
+Incoming, dev_has_header(dev) == true
115112 mac_header -> ll header
116113 data -> data
117114
118
-Outgoing, dev->hard_header!=NULL
115
+Outgoing, dev_has_header(dev) == true
119116 mac_header -> ll header
120117 data -> ll header
121118
122
-Incoming, dev->hard_header==NULL
123
- mac_header -> UNKNOWN position. It is very likely, that it points to ll
124
- header. PPP makes it, that is wrong, because introduce
125
- assymetry between rx and tx paths.
119
+Incoming, dev_has_header(dev) == false
120
+ mac_header -> data
121
+ However drivers often make it point to the ll header.
122
+ This is incorrect because the ll header should be invisible to us.
126123 data -> data
127124
128
-Outgoing, dev->hard_header==NULL
129
- mac_header -> data. ll header is still not built!
125
+Outgoing, dev_has_header(dev) == false
126
+ mac_header -> data. ll header is invisible to us.
130127 data -> data
131128
132129 Resume
133
- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
130
+ If dev_has_header(dev) == false we are unable to restore the ll header,
131
+ because it is invisible to us.
134132
135133
136134 On transmit:
137135 ------------
138136
139
-dev->hard_header != NULL
137
+dev->header_ops != NULL
140138 mac_header -> ll header
141139 data -> ll header
142140
143
-dev->hard_header == NULL (ll header is added by device, we cannot control it)
141
+dev->header_ops == NULL (ll header is invisible to us)
144142 mac_header -> data
145143 data -> data
146144
147
- We should set nh.raw on output to correct posistion,
145
+ We should set network_header on output to the correct position,
148146 packet classifier depends on it.
149147 */
150148
....@@ -183,7 +181,6 @@
183181 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184182 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185183 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186
-#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187184
188185 struct packet_sock;
189186 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
....@@ -272,27 +269,26 @@
272269
273270 static bool packet_use_direct_xmit(const struct packet_sock *po)
274271 {
275
- return po->xmit == packet_direct_xmit;
276
-}
277
-
278
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279
- struct net_device *sb_dev)
280
-{
281
- return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
272
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
273
+ return READ_ONCE(po->xmit) == packet_direct_xmit;
282274 }
283275
284276 static u16 packet_pick_tx_queue(struct sk_buff *skb)
285277 {
286278 struct net_device *dev = skb->dev;
287279 const struct net_device_ops *ops = dev->netdev_ops;
280
+ int cpu = raw_smp_processor_id();
288281 u16 queue_index;
289282
283
+#ifdef CONFIG_XPS
284
+ skb->sender_cpu = cpu + 1;
285
+#endif
286
+ skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
290287 if (ops->ndo_select_queue) {
291
- queue_index = ops->ndo_select_queue(dev, skb, NULL,
292
- __packet_pick_tx_queue);
288
+ queue_index = ops->ndo_select_queue(dev, skb, NULL);
293289 queue_index = netdev_cap_txqueue(dev, queue_index);
294290 } else {
295
- queue_index = __packet_pick_tx_queue(dev, skb, NULL);
291
+ queue_index = netdev_pick_tx(dev, skb, NULL);
296292 }
297293
298294 return queue_index;
....@@ -370,18 +366,20 @@
370366 {
371367 union tpacket_uhdr h;
372368
369
+ /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
370
+
373371 h.raw = frame;
374372 switch (po->tp_version) {
375373 case TPACKET_V1:
376
- h.h1->tp_status = status;
374
+ WRITE_ONCE(h.h1->tp_status, status);
377375 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
378376 break;
379377 case TPACKET_V2:
380
- h.h2->tp_status = status;
378
+ WRITE_ONCE(h.h2->tp_status, status);
381379 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
382380 break;
383381 case TPACKET_V3:
384
- h.h3->tp_status = status;
382
+ WRITE_ONCE(h.h3->tp_status, status);
385383 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386384 break;
387385 default:
....@@ -392,23 +390,25 @@
392390 smp_wmb();
393391 }
394392
395
-static int __packet_get_status(struct packet_sock *po, void *frame)
393
+static int __packet_get_status(const struct packet_sock *po, void *frame)
396394 {
397395 union tpacket_uhdr h;
398396
399397 smp_rmb();
400398
399
+ /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
400
+
401401 h.raw = frame;
402402 switch (po->tp_version) {
403403 case TPACKET_V1:
404404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
405
- return h.h1->tp_status;
405
+ return READ_ONCE(h.h1->tp_status);
406406 case TPACKET_V2:
407407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
408
- return h.h2->tp_status;
408
+ return READ_ONCE(h.h2->tp_status);
409409 case TPACKET_V3:
410410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411
- return h.h3->tp_status;
411
+ return READ_ONCE(h.h3->tp_status);
412412 default:
413413 WARN(1, "TPACKET version not supported.\n");
414414 BUG();
....@@ -416,17 +416,18 @@
416416 }
417417 }
418418
419
-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
419
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
420420 unsigned int flags)
421421 {
422422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423423
424424 if (shhwtstamps &&
425425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426
- ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
426
+ ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
427427 return TP_STATUS_TS_RAW_HARDWARE;
428428
429
- if (ktime_to_timespec_cond(skb->tstamp, ts))
429
+ if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
430
+ ktime_to_timespec64_cond(skb->tstamp, ts))
430431 return TP_STATUS_TS_SOFTWARE;
431432
432433 return 0;
....@@ -436,13 +437,20 @@
436437 struct sk_buff *skb)
437438 {
438439 union tpacket_uhdr h;
439
- struct timespec ts;
440
+ struct timespec64 ts;
440441 __u32 ts_status;
441442
442443 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443444 return 0;
444445
445446 h.raw = frame;
447
+ /*
448
+ * versions 1 through 3 overflow the timestamps in y2106, since they
449
+ * all store the seconds in a 32-bit unsigned integer.
450
+ * If we create a version 4, that should have a 64-bit timestamp,
451
+ * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
452
+ * nanoseconds.
453
+ */
446454 switch (po->tp_version) {
447455 case TPACKET_V1:
448456 h.h1->tp_sec = ts.tv_sec;
....@@ -468,10 +476,10 @@
468476 return ts_status;
469477 }
470478
471
-static void *packet_lookup_frame(struct packet_sock *po,
472
- struct packet_ring_buffer *rb,
473
- unsigned int position,
474
- int status)
479
+static void *packet_lookup_frame(const struct packet_sock *po,
480
+ const struct packet_ring_buffer *rb,
481
+ unsigned int position,
482
+ int status)
475483 {
476484 unsigned int pg_vec_pos, frame_offset;
477485 union tpacket_uhdr h;
....@@ -528,7 +536,7 @@
528536 int blk_size_in_bytes)
529537 {
530538 struct net_device *dev;
531
- unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
539
+ unsigned int mbits, div;
532540 struct ethtool_link_ksettings ecmd;
533541 int err;
534542
....@@ -540,31 +548,25 @@
540548 }
541549 err = __ethtool_get_link_ksettings(dev, &ecmd);
542550 rtnl_unlock();
543
- if (!err) {
544
- /*
545
- * If the link speed is so slow you don't really
546
- * need to worry about perf anyways
547
- */
548
- if (ecmd.base.speed < SPEED_1000 ||
549
- ecmd.base.speed == SPEED_UNKNOWN) {
550
- return DEFAULT_PRB_RETIRE_TOV;
551
- } else {
552
- msec = 1;
553
- div = ecmd.base.speed / 1000;
554
- }
555
- } else
551
+ if (err)
556552 return DEFAULT_PRB_RETIRE_TOV;
557553
554
+ /* If the link speed is so slow you don't really
555
+ * need to worry about perf anyways
556
+ */
557
+ if (ecmd.base.speed < SPEED_1000 ||
558
+ ecmd.base.speed == SPEED_UNKNOWN)
559
+ return DEFAULT_PRB_RETIRE_TOV;
560
+
561
+ div = ecmd.base.speed / 1000;
558562 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
559563
560564 if (div)
561565 mbits /= div;
562566
563
- tmo = mbits * msec;
564
-
565567 if (div)
566
- return tmo+1;
567
- return tmo;
568
+ return mbits + 1;
569
+ return mbits;
568570 }
569571
570572 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
....@@ -600,6 +602,7 @@
600602 req_u->req3.tp_block_size);
601603 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
602604 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
605
+ rwlock_init(&p1->blk_fill_in_prog_lock);
603606
604607 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
605608 prb_init_ft_ops(p1, req_u);
....@@ -666,10 +669,9 @@
666669 *
667670 */
668671 if (BLOCK_NUM_PKTS(pbd)) {
669
- while (atomic_read(&pkc->blk_fill_in_prog)) {
670
- /* Waiting for skb_copy_bits to finish... */
671
- cpu_relax();
672
- }
672
+ /* Waiting for skb_copy_bits to finish... */
673
+ write_lock(&pkc->blk_fill_in_prog_lock);
674
+ write_unlock(&pkc->blk_fill_in_prog_lock);
673675 }
674676
675677 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
....@@ -767,7 +769,7 @@
767769 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
768770 struct sock *sk = &po->sk;
769771
770
- if (po->stats.stats3.tp_drops)
772
+ if (atomic_read(&po->tp_drops))
771773 status |= TP_STATUS_LOSING;
772774
773775 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
....@@ -783,8 +785,8 @@
783785 * It shouldn't really happen as we don't close empty
784786 * blocks. See prb_retire_rx_blk_timer_expired().
785787 */
786
- struct timespec ts;
787
- getnstimeofday(&ts);
788
+ struct timespec64 ts;
789
+ ktime_get_real_ts64(&ts);
788790 h1->ts_last_pkt.ts_sec = ts.tv_sec;
789791 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
790792 }
....@@ -814,7 +816,7 @@
814816 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
815817 struct tpacket_block_desc *pbd1)
816818 {
817
- struct timespec ts;
819
+ struct timespec64 ts;
818820 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
819821
820822 smp_rmb();
....@@ -827,7 +829,7 @@
827829 BLOCK_NUM_PKTS(pbd1) = 0;
828830 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829831
830
- getnstimeofday(&ts);
832
+ ktime_get_real_ts64(&ts);
831833
832834 h1->ts_first_pkt.ts_sec = ts.tv_sec;
833835 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
....@@ -928,10 +930,9 @@
928930 * the timer-handler already handled this case.
929931 */
930932 if (!(status & TP_STATUS_BLK_TMO)) {
931
- while (atomic_read(&pkc->blk_fill_in_prog)) {
932
- /* Waiting for skb_copy_bits to finish... */
933
- cpu_relax();
934
- }
933
+ /* Waiting for skb_copy_bits to finish... */
934
+ write_lock(&pkc->blk_fill_in_prog_lock);
935
+ write_unlock(&pkc->blk_fill_in_prog_lock);
935936 }
936937 prb_close_block(pkc, pbd, po, status);
937938 return;
....@@ -952,7 +953,8 @@
952953 __releases(&pkc->blk_fill_in_prog_lock)
953954 {
954955 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
955
- atomic_dec(&pkc->blk_fill_in_prog);
956
+
957
+ read_unlock(&pkc->blk_fill_in_prog_lock);
956958 }
957959
958960 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
....@@ -1007,14 +1009,13 @@
10071009 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
10081010 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
10091011 BLOCK_NUM_PKTS(pbd) += 1;
1010
- atomic_inc(&pkc->blk_fill_in_prog);
1012
+ read_lock(&pkc->blk_fill_in_prog_lock);
10111013 prb_run_all_ft_ops(pkc, ppd);
10121014 }
10131015
10141016 /* Assumes caller has the sk->rx_queue.lock */
10151017 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
10161018 struct sk_buff *skb,
1017
- int status,
10181019 unsigned int len
10191020 )
10201021 {
....@@ -1086,7 +1087,7 @@
10861087 po->rx_ring.head, status);
10871088 return curr;
10881089 case TPACKET_V3:
1089
- return __packet_lookup_frame_in_block(po, skb, status, len);
1090
+ return __packet_lookup_frame_in_block(po, skb, len);
10901091 default:
10911092 WARN(1, "TPACKET version not supported\n");
10921093 BUG();
....@@ -1094,10 +1095,10 @@
10941095 }
10951096 }
10961097
1097
-static void *prb_lookup_block(struct packet_sock *po,
1098
- struct packet_ring_buffer *rb,
1099
- unsigned int idx,
1100
- int status)
1098
+static void *prb_lookup_block(const struct packet_sock *po,
1099
+ const struct packet_ring_buffer *rb,
1100
+ unsigned int idx,
1101
+ int status)
11011102 {
11021103 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
11031104 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
....@@ -1210,12 +1211,12 @@
12101211 #define ROOM_LOW 0x1
12111212 #define ROOM_NORMAL 0x2
12121213
1213
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1214
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
12141215 {
12151216 int idx, len;
12161217
1217
- len = po->rx_ring.frame_max + 1;
1218
- idx = po->rx_ring.head;
1218
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
1219
+ idx = READ_ONCE(po->rx_ring.head);
12191220 if (pow_off)
12201221 idx += len >> pow_off;
12211222 if (idx >= len)
....@@ -1223,12 +1224,12 @@
12231224 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
12241225 }
12251226
1226
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1227
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
12271228 {
12281229 int idx, len;
12291230
1230
- len = po->rx_ring.prb_bdqc.knum_blocks;
1231
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1231
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1232
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
12321233 if (pow_off)
12331234 idx += len >> pow_off;
12341235 if (idx >= len)
....@@ -1236,15 +1237,18 @@
12361237 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
12371238 }
12381239
1239
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1240
+static int __packet_rcv_has_room(const struct packet_sock *po,
1241
+ const struct sk_buff *skb)
12401242 {
1241
- struct sock *sk = &po->sk;
1243
+ const struct sock *sk = &po->sk;
12421244 int ret = ROOM_NONE;
12431245
12441246 if (po->prot_hook.func != tpacket_rcv) {
1245
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1246
- - (skb ? skb->truesize : 0);
1247
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1247
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1248
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1249
+ - (skb ? skb->truesize : 0);
1250
+
1251
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
12481252 return ROOM_NORMAL;
12491253 else if (avail > 0)
12501254 return ROOM_LOW;
....@@ -1269,17 +1273,22 @@
12691273
12701274 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
12711275 {
1272
- int ret;
1273
- bool has_room;
1276
+ int pressure, ret;
12741277
1275
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
12761278 ret = __packet_rcv_has_room(po, skb);
1277
- has_room = ret == ROOM_NORMAL;
1278
- if (po->pressure == has_room)
1279
- po->pressure = !has_room;
1280
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1279
+ pressure = ret != ROOM_NORMAL;
1280
+
1281
+ if (READ_ONCE(po->pressure) != pressure)
1282
+ WRITE_ONCE(po->pressure, pressure);
12811283
12821284 return ret;
1285
+}
1286
+
1287
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1288
+{
1289
+ if (READ_ONCE(po->pressure) &&
1290
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1291
+ WRITE_ONCE(po->pressure, 0);
12831292 }
12841293
12851294 static void packet_sock_destruct(struct sock *sk)
....@@ -1355,7 +1364,7 @@
13551364 struct packet_sock *po, *po_next, *po_skip = NULL;
13561365 unsigned int i, j, room = ROOM_NONE;
13571366
1358
- po = pkt_sk(f->arr[idx]);
1367
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
13591368
13601369 if (try_self) {
13611370 room = packet_rcv_has_room(po, skb);
....@@ -1367,8 +1376,8 @@
13671376
13681377 i = j = min_t(int, po->rollover->sock, num - 1);
13691378 do {
1370
- po_next = pkt_sk(f->arr[i]);
1371
- if (po_next != po_skip && !po_next->pressure &&
1379
+ po_next = pkt_sk(rcu_dereference(f->arr[i]));
1380
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
13721381 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
13731382 if (i != j)
13741383 po->rollover->sock = i;
....@@ -1462,7 +1471,7 @@
14621471 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
14631472 idx = fanout_demux_rollover(f, skb, idx, true, num);
14641473
1465
- po = pkt_sk(f->arr[idx]);
1474
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
14661475 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
14671476 }
14681477
....@@ -1476,7 +1485,7 @@
14761485 struct packet_fanout *f = po->fanout;
14771486
14781487 spin_lock(&f->lock);
1479
- f->arr[f->num_members] = sk;
1488
+ rcu_assign_pointer(f->arr[f->num_members], sk);
14801489 smp_wmb();
14811490 f->num_members++;
14821491 if (f->num_members == 1)
....@@ -1491,11 +1500,14 @@
14911500
14921501 spin_lock(&f->lock);
14931502 for (i = 0; i < f->num_members; i++) {
1494
- if (f->arr[i] == sk)
1503
+ if (rcu_dereference_protected(f->arr[i],
1504
+ lockdep_is_held(&f->lock)) == sk)
14951505 break;
14961506 }
14971507 BUG_ON(i >= f->num_members);
1498
- f->arr[i] = f->arr[f->num_members - 1];
1508
+ rcu_assign_pointer(f->arr[i],
1509
+ rcu_dereference_protected(f->arr[f->num_members - 1],
1510
+ lockdep_is_held(&f->lock)));
14991511 f->num_members--;
15001512 if (f->num_members == 0)
15011513 __dev_remove_pack(&f->prot_hook);
....@@ -1538,7 +1550,7 @@
15381550 }
15391551 }
15401552
1541
-static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1553
+static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
15421554 unsigned int len)
15431555 {
15441556 struct bpf_prog *new;
....@@ -1547,10 +1559,10 @@
15471559
15481560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
15491561 return -EPERM;
1550
- if (len != sizeof(fprog))
1551
- return -EINVAL;
1552
- if (copy_from_user(&fprog, data, len))
1553
- return -EFAULT;
1562
+
1563
+ ret = copy_bpf_fprog_from_user(&fprog, data, len);
1564
+ if (ret)
1565
+ return ret;
15541566
15551567 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
15561568 if (ret)
....@@ -1560,7 +1572,7 @@
15601572 return 0;
15611573 }
15621574
1563
-static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1575
+static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
15641576 unsigned int len)
15651577 {
15661578 struct bpf_prog *new;
....@@ -1570,7 +1582,7 @@
15701582 return -EPERM;
15711583 if (len != sizeof(fd))
15721584 return -EINVAL;
1573
- if (copy_from_user(&fd, data, len))
1585
+ if (copy_from_sockptr(&fd, data, len))
15741586 return -EFAULT;
15751587
15761588 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
....@@ -1581,7 +1593,7 @@
15811593 return 0;
15821594 }
15831595
1584
-static int fanout_set_data(struct packet_sock *po, char __user *data,
1596
+static int fanout_set_data(struct packet_sock *po, sockptr_t data,
15851597 unsigned int len)
15861598 {
15871599 switch (po->fanout->type) {
....@@ -1633,13 +1645,15 @@
16331645 return false;
16341646 }
16351647
1636
-static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1648
+static int fanout_add(struct sock *sk, struct fanout_args *args)
16371649 {
16381650 struct packet_rollover *rollover = NULL;
16391651 struct packet_sock *po = pkt_sk(sk);
1652
+ u16 type_flags = args->type_flags;
16401653 struct packet_fanout *f, *match;
16411654 u8 type = type_flags & 0xff;
16421655 u8 flags = type_flags >> 8;
1656
+ u16 id = args->id;
16431657 int err;
16441658
16451659 switch (type) {
....@@ -1697,11 +1711,21 @@
16971711 }
16981712 }
16991713 err = -EINVAL;
1700
- if (match && match->flags != flags)
1701
- goto out;
1702
- if (!match) {
1714
+ if (match) {
1715
+ if (match->flags != flags)
1716
+ goto out;
1717
+ if (args->max_num_members &&
1718
+ args->max_num_members != match->max_num_members)
1719
+ goto out;
1720
+ } else {
1721
+ if (args->max_num_members > PACKET_FANOUT_MAX)
1722
+ goto out;
1723
+ if (!args->max_num_members)
1724
+ /* legacy PACKET_FANOUT_MAX */
1725
+ args->max_num_members = 256;
17031726 err = -ENOMEM;
1704
- match = kzalloc(sizeof(*match), GFP_KERNEL);
1727
+ match = kvzalloc(struct_size(match, arr, args->max_num_members),
1728
+ GFP_KERNEL);
17051729 if (!match)
17061730 goto out;
17071731 write_pnet(&match->net, sock_net(sk));
....@@ -1718,6 +1742,7 @@
17181742 match->prot_hook.af_packet_priv = match;
17191743 match->prot_hook.af_packet_net = read_pnet(&match->net);
17201744 match->prot_hook.id_match = match_fanout_group;
1745
+ match->max_num_members = args->max_num_members;
17211746 list_add(&match->list, &fanout_list);
17221747 }
17231748 err = -EINVAL;
....@@ -1728,7 +1753,7 @@
17281753 match->prot_hook.type == po->prot_hook.type &&
17291754 match->prot_hook.dev == po->prot_hook.dev) {
17301755 err = -ENOSPC;
1731
- if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1756
+ if (refcount_read(&match->sk_ref) < match->max_num_members) {
17321757 __dev_remove_pack(&po->prot_hook);
17331758
17341759 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
....@@ -1745,7 +1770,7 @@
17451770
17461771 if (err && !refcount_read(&match->sk_ref)) {
17471772 list_del(&match->list);
1748
- kfree(match);
1773
+ kvfree(match);
17491774 }
17501775
17511776 out:
....@@ -1835,7 +1860,7 @@
18351860 skb_dst_drop(skb);
18361861
18371862 /* drop conntrack reference */
1838
- nf_reset(skb);
1863
+ nf_reset_ct(skb);
18391864
18401865 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
18411866
....@@ -1863,6 +1888,24 @@
18631888 return 0;
18641889 }
18651890
1891
+static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1892
+{
1893
+ int depth;
1894
+
1895
+ if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1896
+ sock->type == SOCK_RAW) {
1897
+ skb_reset_mac_header(skb);
1898
+ skb->protocol = dev_parse_header_protocol(skb);
1899
+ }
1900
+
1901
+ /* Move network header to the right position for VLAN tagged packets */
1902
+ if (likely(skb->dev->type == ARPHRD_ETHER) &&
1903
+ eth_type_vlan(skb->protocol) &&
1904
+ vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1905
+ skb_set_network_header(skb, depth);
1906
+
1907
+ skb_probe_transport_header(skb);
1908
+}
18661909
18671910 /*
18681911 * Output a raw packet to a device layer. This bypasses all the other
....@@ -1955,7 +1998,7 @@
19551998 goto retry;
19561999 }
19572000
1958
- if (!dev_validate_header(dev, skb->data, len)) {
2001
+ if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
19592002 err = -EINVAL;
19602003 goto out_unlock;
19612004 }
....@@ -1978,12 +2021,12 @@
19782021 skb->mark = sk->sk_mark;
19792022 skb->tstamp = sockc.transmit_time;
19802023
1981
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2024
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
19822025
19832026 if (unlikely(extra_len == 4))
19842027 skb->no_fcs = 1;
19852028
1986
- skb_probe_transport_header(skb, 0);
2029
+ packet_parse_headers(skb, sock);
19872030
19882031 dev_queue_xmit(skb);
19892032 rcu_read_unlock();
....@@ -2060,7 +2103,7 @@
20602103
20612104 skb->dev = dev;
20622105
2063
- if (dev->header_ops) {
2106
+ if (dev_has_header(dev)) {
20642107 /* The device has an explicit notion of ll header,
20652108 * exported to higher levels.
20662109 *
....@@ -2105,7 +2148,7 @@
21052148 sll = &PACKET_SKB_CB(skb)->sa.ll;
21062149 sll->sll_hatype = dev->type;
21072150 sll->sll_pkttype = skb->pkt_type;
2108
- if (unlikely(po->origdev))
2151
+ if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
21092152 sll->sll_ifindex = orig_dev->ifindex;
21102153 else
21112154 sll->sll_ifindex = dev->ifindex;
....@@ -2125,7 +2168,7 @@
21252168 skb_dst_drop(skb);
21262169
21272170 /* drop conntrack reference */
2128
- nf_reset(skb);
2171
+ nf_reset_ct(skb);
21292172
21302173 spin_lock(&sk->sk_receive_queue.lock);
21312174 po->stats.stats1.tp_packets++;
....@@ -2137,10 +2180,8 @@
21372180
21382181 drop_n_acct:
21392182 is_drop_n_account = true;
2140
- spin_lock(&sk->sk_receive_queue.lock);
2141
- po->stats.stats1.tp_drops++;
2183
+ atomic_inc(&po->tp_drops);
21422184 atomic_inc(&sk->sk_drops);
2143
- spin_unlock(&sk->sk_receive_queue.lock);
21442185
21452186 drop_n_restore:
21462187 if (skb_head != skb->data && skb_shared(skb)) {
....@@ -2169,7 +2210,7 @@
21692210 unsigned short macoff, hdrlen;
21702211 unsigned int netoff;
21712212 struct sk_buff *copy_skb = NULL;
2172
- struct timespec ts;
2213
+ struct timespec64 ts;
21732214 __u32 ts_status;
21742215 bool is_drop_n_account = false;
21752216 unsigned int slot_id = 0;
....@@ -2191,7 +2232,7 @@
21912232 if (!net_eq(dev_net(dev), sock_net(sk)))
21922233 goto drop;
21932234
2194
- if (dev->header_ops) {
2235
+ if (dev_has_header(dev)) {
21952236 if (sk->sk_type != SOCK_DGRAM)
21962237 skb_push(skb, skb->data - skb_mac_header(skb));
21972238 else if (skb->pkt_type == PACKET_OUTGOING) {
....@@ -2206,11 +2247,16 @@
22062247 if (!res)
22072248 goto drop_n_restore;
22082249
2250
+ /* If we are flooded, just give up */
2251
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2252
+ atomic_inc(&po->tp_drops);
2253
+ goto drop_n_restore;
2254
+ }
2255
+
22092256 if (skb->ip_summed == CHECKSUM_PARTIAL)
22102257 status |= TP_STATUS_CSUMNOTREADY;
22112258 else if (skb->pkt_type != PACKET_OUTGOING &&
2212
- (skb->ip_summed == CHECKSUM_COMPLETE ||
2213
- skb_csum_unnecessary(skb)))
2259
+ skb_csum_unnecessary(skb))
22142260 status |= TP_STATUS_CSUM_VALID;
22152261
22162262 if (snaplen > res)
....@@ -2231,9 +2277,7 @@
22312277 macoff = netoff - maclen;
22322278 }
22332279 if (netoff > USHRT_MAX) {
2234
- spin_lock(&sk->sk_receive_queue.lock);
2235
- po->stats.stats1.tp_drops++;
2236
- spin_unlock(&sk->sk_receive_queue.lock);
2280
+ atomic_inc(&po->tp_drops);
22372281 goto drop_n_restore;
22382282 }
22392283 if (po->tp_version <= TPACKET_V2) {
....@@ -2246,8 +2290,11 @@
22462290 copy_skb = skb_get(skb);
22472291 skb_head = skb->data;
22482292 }
2249
- if (copy_skb)
2293
+ if (copy_skb) {
2294
+ memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2295
+ sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
22502296 skb_set_owner_r(copy_skb, sk);
2297
+ }
22512298 }
22522299 snaplen = po->rx_ring.frame_size - macoff;
22532300 if ((int)snaplen < 0) {
....@@ -2299,7 +2346,7 @@
22992346 * Anyways, moving it for V1/V2 only as V3 doesn't need this
23002347 * at packet level.
23012348 */
2302
- if (po->stats.stats1.tp_drops)
2349
+ if (atomic_read(&po->tp_drops))
23032350 status |= TP_STATUS_LOSING;
23042351 }
23052352
....@@ -2312,8 +2359,13 @@
23122359
23132360 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
23142361
2315
- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2316
- getnstimeofday(&ts);
2362
+ /* Always timestamp; prefer an existing software timestamp taken
2363
+ * closer to the time of capture.
2364
+ */
2365
+ ts_status = tpacket_get_timestamp(skb, &ts,
2366
+ po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
2367
+ if (!ts_status)
2368
+ ktime_get_real_ts64(&ts);
23172369
23182370 status |= ts_status;
23192371
....@@ -2369,7 +2421,7 @@
23692421 sll->sll_hatype = dev->type;
23702422 sll->sll_protocol = skb->protocol;
23712423 sll->sll_pkttype = skb->pkt_type;
2372
- if (unlikely(po->origdev))
2424
+ if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
23732425 sll->sll_ifindex = orig_dev->ifindex;
23742426 else
23752427 sll->sll_ifindex = dev->ifindex;
....@@ -2412,9 +2464,9 @@
24122464 return 0;
24132465
24142466 drop_n_account:
2415
- is_drop_n_account = true;
2416
- po->stats.stats1.tp_drops++;
24172467 spin_unlock(&sk->sk_receive_queue.lock);
2468
+ atomic_inc(&po->tp_drops);
2469
+ is_drop_n_account = true;
24182470
24192471 sk->sk_data_ready(sk);
24202472 kfree_skb(copy_skb);
....@@ -2440,15 +2492,6 @@
24402492 }
24412493
24422494 sock_wfree(skb);
2443
-}
2444
-
2445
-static void tpacket_set_protocol(const struct net_device *dev,
2446
- struct sk_buff *skb)
2447
-{
2448
- if (dev->type == ARPHRD_ETHER) {
2449
- skb_reset_mac_header(skb);
2450
- skb->protocol = eth_hdr(skb)->h_proto;
2451
- }
24522495 }
24532496
24542497 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
....@@ -2498,7 +2541,7 @@
24982541 skb->priority = po->sk.sk_priority;
24992542 skb->mark = po->sk.sk_mark;
25002543 skb->tstamp = sockc->transmit_time;
2501
- sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2544
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
25022545 skb_zcopy_set_nouarg(skb, ph.raw);
25032546
25042547 skb_reserve(skb, hlen);
....@@ -2521,8 +2564,6 @@
25212564 return err;
25222565 if (!dev_validate_header(dev, skb->data, hdrlen))
25232566 return -EINVAL;
2524
- if (!skb->protocol)
2525
- tpacket_set_protocol(dev, skb);
25262567
25272568 data += hdrlen;
25282569 to_write -= hdrlen;
....@@ -2557,7 +2598,7 @@
25572598 len = ((to_write > len_max) ? len_max : to_write);
25582599 }
25592600
2560
- skb_probe_transport_header(skb, 0);
2601
+ packet_parse_headers(skb, sock);
25612602
25622603 return tp_len;
25632604 }
....@@ -2787,9 +2828,11 @@
27872828 packet_inc_pending(&po->tx_ring);
27882829
27892830 status = TP_STATUS_SEND_REQUEST;
2790
- err = po->xmit(skb);
2791
- if (unlikely(err > 0)) {
2792
- err = net_xmit_errno(err);
2831
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
2832
+ err = READ_ONCE(po->xmit)(skb);
2833
+ if (unlikely(err != 0)) {
2834
+ if (err > 0)
2835
+ err = net_xmit_errno(err);
27932836 if (err && __packet_get_status(po, ph) ==
27942837 TP_STATUS_AVAILABLE) {
27952838 /* skb was destructed already */
....@@ -2956,13 +2999,13 @@
29562999 if (err)
29573000 goto out_free;
29583001
2959
- if (sock->type == SOCK_RAW &&
2960
- !dev_validate_header(dev, skb->data, len)) {
3002
+ if ((sock->type == SOCK_RAW &&
3003
+ !dev_validate_header(dev, skb->data, len)) || !skb->len) {
29613004 err = -EINVAL;
29623005 goto out_free;
29633006 }
29643007
2965
- sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
3008
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
29663009
29673010 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
29683011 !packet_extra_vlan_len_allowed(dev, skb)) {
....@@ -2976,6 +3019,11 @@
29763019 skb->mark = sockc.mark;
29773020 skb->tstamp = sockc.transmit_time;
29783021
3022
+ if (unlikely(extra_len == 4))
3023
+ skb->no_fcs = 1;
3024
+
3025
+ packet_parse_headers(skb, sock);
3026
+
29793027 if (has_vnet_hdr) {
29803028 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
29813029 if (err)
....@@ -2984,14 +3032,14 @@
29843032 virtio_net_hdr_set_proto(skb, &vnet_hdr);
29853033 }
29863034
2987
- skb_probe_transport_header(skb, reserve);
2988
-
2989
- if (unlikely(extra_len == 4))
2990
- skb->no_fcs = 1;
2991
-
2992
- err = po->xmit(skb);
2993
- if (err > 0 && (err = net_xmit_errno(err)) != 0)
2994
- goto out_unlock;
3035
+ /* Paired with WRITE_ONCE() in packet_setsockopt() */
3036
+ err = READ_ONCE(po->xmit)(skb);
3037
+ if (unlikely(err != 0)) {
3038
+ if (err > 0)
3039
+ err = net_xmit_errno(err);
3040
+ if (err)
3041
+ goto out_unlock;
3042
+ }
29953043
29963044 dev_put(dev);
29973045
....@@ -3011,10 +3059,13 @@
30113059 struct sock *sk = sock->sk;
30123060 struct packet_sock *po = pkt_sk(sk);
30133061
3014
- if (po->tx_ring.pg_vec)
3062
+ /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
3063
+ * tpacket_snd() will redo the check safely.
3064
+ */
3065
+ if (data_race(po->tx_ring.pg_vec))
30153066 return tpacket_snd(po, msg);
3016
- else
3017
- return packet_snd(sock, msg, len);
3067
+
3068
+ return packet_snd(sock, msg, len);
30183069 }
30193070
30203071 /*
....@@ -3075,7 +3126,7 @@
30753126 kfree(po->rollover);
30763127 if (f) {
30773128 fanout_release_data(f);
3078
- kfree(f);
3129
+ kvfree(f);
30793130 }
30803131 /*
30813132 * Now the socket is dead. No more input will appear.
....@@ -3110,6 +3161,9 @@
31103161
31113162 lock_sock(sk);
31123163 spin_lock(&po->bind_lock);
3164
+ if (!proto)
3165
+ proto = po->num;
3166
+
31133167 rcu_read_lock();
31143168
31153169 if (po->fanout) {
....@@ -3212,7 +3266,7 @@
32123266 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
32133267 name[sizeof(uaddr->sa_data)] = 0;
32143268
3215
- return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3269
+ return packet_do_bind(sk, name, 0, 0);
32163270 }
32173271
32183272 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
....@@ -3229,8 +3283,7 @@
32293283 if (sll->sll_family != AF_PACKET)
32303284 return -EINVAL;
32313285
3232
- return packet_do_bind(sk, NULL, sll->sll_ifindex,
3233
- sll->sll_protocol ? : pkt_sk(sk)->num);
3286
+ return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
32343287 }
32353288
32363289 static struct proto packet_proto = {
....@@ -3370,8 +3423,7 @@
33703423 if (skb == NULL)
33713424 goto out;
33723425
3373
- if (pkt_sk(sk)->pressure)
3374
- packet_rcv_has_room(pkt_sk(sk), NULL);
3426
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
33753427
33763428 if (pkt_sk(sk)->has_vnet_hdr) {
33773429 err = packet_rcv_vnet(msg, skb, &len);
....@@ -3406,6 +3458,8 @@
34063458 sock_recv_ts_and_drops(msg, sk, skb);
34073459
34083460 if (msg->msg_name) {
3461
+ const size_t max_len = min(sizeof(skb->cb),
3462
+ sizeof(struct sockaddr_storage));
34093463 int copy_len;
34103464
34113465 /* If the address length field is there to be filled
....@@ -3428,18 +3482,21 @@
34283482 msg->msg_namelen = sizeof(struct sockaddr_ll);
34293483 }
34303484 }
3485
+ if (WARN_ON_ONCE(copy_len > max_len)) {
3486
+ copy_len = max_len;
3487
+ msg->msg_namelen = copy_len;
3488
+ }
34313489 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
34323490 }
34333491
3434
- if (pkt_sk(sk)->auxdata) {
3492
+ if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
34353493 struct tpacket_auxdata aux;
34363494
34373495 aux.tp_status = TP_STATUS_USER;
34383496 if (skb->ip_summed == CHECKSUM_PARTIAL)
34393497 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
34403498 else if (skb->pkt_type != PACKET_OUTGOING &&
3441
- (skb->ip_summed == CHECKSUM_COMPLETE ||
3442
- skb_csum_unnecessary(skb)))
3499
+ skb_csum_unnecessary(skb))
34433500 aux.tp_status |= TP_STATUS_CSUM_VALID;
34443501
34453502 aux.tp_len = origlen;
....@@ -3669,7 +3726,8 @@
36693726 }
36703727
36713728 static int
3672
-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3729
+packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3730
+ unsigned int optlen)
36733731 {
36743732 struct sock *sk = sock->sk;
36753733 struct packet_sock *po = pkt_sk(sk);
....@@ -3689,7 +3747,7 @@
36893747 return -EINVAL;
36903748 if (len > sizeof(mreq))
36913749 len = sizeof(mreq);
3692
- if (copy_from_user(&mreq, optval, len))
3750
+ if (copy_from_sockptr(&mreq, optval, len))
36933751 return -EFAULT;
36943752 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
36953753 return -EINVAL;
....@@ -3720,7 +3778,7 @@
37203778 if (optlen < len) {
37213779 ret = -EINVAL;
37223780 } else {
3723
- if (copy_from_user(&req_u.req, optval, len))
3781
+ if (copy_from_sockptr(&req_u.req, optval, len))
37243782 ret = -EFAULT;
37253783 else
37263784 ret = packet_set_ring(sk, &req_u, 0,
....@@ -3735,7 +3793,7 @@
37353793
37363794 if (optlen != sizeof(val))
37373795 return -EINVAL;
3738
- if (copy_from_user(&val, optval, sizeof(val)))
3796
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37393797 return -EFAULT;
37403798
37413799 pkt_sk(sk)->copy_thresh = val;
....@@ -3747,7 +3805,7 @@
37473805
37483806 if (optlen != sizeof(val))
37493807 return -EINVAL;
3750
- if (copy_from_user(&val, optval, sizeof(val)))
3808
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37513809 return -EFAULT;
37523810 switch (val) {
37533811 case TPACKET_V1:
....@@ -3773,7 +3831,7 @@
37733831
37743832 if (optlen != sizeof(val))
37753833 return -EINVAL;
3776
- if (copy_from_user(&val, optval, sizeof(val)))
3834
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37773835 return -EFAULT;
37783836 if (val > INT_MAX)
37793837 return -EINVAL;
....@@ -3793,7 +3851,7 @@
37933851
37943852 if (optlen != sizeof(val))
37953853 return -EINVAL;
3796
- if (copy_from_user(&val, optval, sizeof(val)))
3854
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
37973855 return -EFAULT;
37983856
37993857 lock_sock(sk);
....@@ -3812,12 +3870,10 @@
38123870
38133871 if (optlen < sizeof(val))
38143872 return -EINVAL;
3815
- if (copy_from_user(&val, optval, sizeof(val)))
3873
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38163874 return -EFAULT;
38173875
3818
- lock_sock(sk);
3819
- po->auxdata = !!val;
3820
- release_sock(sk);
3876
+ packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
38213877 return 0;
38223878 }
38233879 case PACKET_ORIGDEV:
....@@ -3826,12 +3882,10 @@
38263882
38273883 if (optlen < sizeof(val))
38283884 return -EINVAL;
3829
- if (copy_from_user(&val, optval, sizeof(val)))
3885
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38303886 return -EFAULT;
38313887
3832
- lock_sock(sk);
3833
- po->origdev = !!val;
3834
- release_sock(sk);
3888
+ packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
38353889 return 0;
38363890 }
38373891 case PACKET_VNET_HDR:
....@@ -3842,7 +3896,7 @@
38423896 return -EINVAL;
38433897 if (optlen < sizeof(val))
38443898 return -EINVAL;
3845
- if (copy_from_user(&val, optval, sizeof(val)))
3899
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38463900 return -EFAULT;
38473901
38483902 lock_sock(sk);
....@@ -3861,7 +3915,7 @@
38613915
38623916 if (optlen != sizeof(val))
38633917 return -EINVAL;
3864
- if (copy_from_user(&val, optval, sizeof(val)))
3918
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38653919 return -EFAULT;
38663920
38673921 po->tp_tstamp = val;
....@@ -3869,14 +3923,14 @@
38693923 }
38703924 case PACKET_FANOUT:
38713925 {
3872
- int val;
3926
+ struct fanout_args args = { 0 };
38733927
3874
- if (optlen != sizeof(val))
3928
+ if (optlen != sizeof(int) && optlen != sizeof(args))
38753929 return -EINVAL;
3876
- if (copy_from_user(&val, optval, sizeof(val)))
3930
+ if (copy_from_sockptr(&args, optval, optlen))
38773931 return -EFAULT;
38783932
3879
- return fanout_add(sk, val & 0xffff, val >> 16);
3933
+ return fanout_add(sk, &args);
38803934 }
38813935 case PACKET_FANOUT_DATA:
38823936 {
....@@ -3886,13 +3940,27 @@
38863940
38873941 return fanout_set_data(po, optval, optlen);
38883942 }
3943
+ case PACKET_IGNORE_OUTGOING:
3944
+ {
3945
+ int val;
3946
+
3947
+ if (optlen != sizeof(val))
3948
+ return -EINVAL;
3949
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
3950
+ return -EFAULT;
3951
+ if (val < 0 || val > 1)
3952
+ return -EINVAL;
3953
+
3954
+ po->prot_hook.ignore_outgoing = !!val;
3955
+ return 0;
3956
+ }
38893957 case PACKET_TX_HAS_OFF:
38903958 {
38913959 unsigned int val;
38923960
38933961 if (optlen != sizeof(val))
38943962 return -EINVAL;
3895
- if (copy_from_user(&val, optval, sizeof(val)))
3963
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
38963964 return -EFAULT;
38973965
38983966 lock_sock(sk);
....@@ -3911,10 +3979,11 @@
39113979
39123980 if (optlen != sizeof(val))
39133981 return -EINVAL;
3914
- if (copy_from_user(&val, optval, sizeof(val)))
3982
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
39153983 return -EFAULT;
39163984
3917
- po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3985
+ /* Paired with all lockless reads of po->xmit */
3986
+ WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
39183987 return 0;
39193988 }
39203989 default:
....@@ -3932,6 +4001,7 @@
39324001 void *data = &val;
39334002 union tpacket_stats_u st;
39344003 struct tpacket_rollover_stats rstats;
4004
+ int drops;
39354005
39364006 if (level != SOL_PACKET)
39374007 return -ENOPROTOOPT;
....@@ -3948,23 +4018,26 @@
39484018 memcpy(&st, &po->stats, sizeof(st));
39494019 memset(&po->stats, 0, sizeof(po->stats));
39504020 spin_unlock_bh(&sk->sk_receive_queue.lock);
4021
+ drops = atomic_xchg(&po->tp_drops, 0);
39514022
39524023 if (po->tp_version == TPACKET_V3) {
39534024 lv = sizeof(struct tpacket_stats_v3);
3954
- st.stats3.tp_packets += st.stats3.tp_drops;
4025
+ st.stats3.tp_drops = drops;
4026
+ st.stats3.tp_packets += drops;
39554027 data = &st.stats3;
39564028 } else {
39574029 lv = sizeof(struct tpacket_stats);
3958
- st.stats1.tp_packets += st.stats1.tp_drops;
4030
+ st.stats1.tp_drops = drops;
4031
+ st.stats1.tp_packets += drops;
39594032 data = &st.stats1;
39604033 }
39614034
39624035 break;
39634036 case PACKET_AUXDATA:
3964
- val = po->auxdata;
4037
+ val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
39654038 break;
39664039 case PACKET_ORIGDEV:
3967
- val = po->origdev;
4040
+ val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
39684041 break;
39694042 case PACKET_VNET_HDR:
39704043 val = po->has_vnet_hdr;
....@@ -4009,6 +4082,9 @@
40094082 ((u32)po->fanout->flags << 24)) :
40104083 0);
40114084 break;
4085
+ case PACKET_IGNORE_OUTGOING:
4086
+ val = po->prot_hook.ignore_outgoing;
4087
+ break;
40124088 case PACKET_ROLLOVER_STATS:
40134089 if (!po->rollover)
40144090 return -EINVAL;
....@@ -4037,28 +4113,6 @@
40374113 return 0;
40384114 }
40394115
4040
-
4041
-#ifdef CONFIG_COMPAT
4042
-static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4043
- char __user *optval, unsigned int optlen)
4044
-{
4045
- struct packet_sock *po = pkt_sk(sock->sk);
4046
-
4047
- if (level != SOL_PACKET)
4048
- return -ENOPROTOOPT;
4049
-
4050
- if (optname == PACKET_FANOUT_DATA &&
4051
- po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4052
- optval = (char __user *)get_compat_bpf_fprog(optval);
4053
- if (!optval)
4054
- return -EFAULT;
4055
- optlen = sizeof(struct sock_fprog);
4056
- }
4057
-
4058
- return packet_setsockopt(sock, level, optname, optval, optlen);
4059
-}
4060
-#endif
4061
-
40624116 static int packet_notifier(struct notifier_block *this,
40634117 unsigned long msg, void *ptr)
40644118 {
....@@ -4074,7 +4128,7 @@
40744128 case NETDEV_UNREGISTER:
40754129 if (po->mclist)
40764130 packet_dev_mclist_delete(dev, &po->mclist);
4077
- /* fallthrough */
4131
+ fallthrough;
40784132
40794133 case NETDEV_DOWN:
40804134 if (dev->ifindex == po->ifindex) {
....@@ -4134,11 +4188,6 @@
41344188 spin_unlock_bh(&sk->sk_receive_queue.lock);
41354189 return put_user(amount, (int __user *)arg);
41364190 }
4137
- case SIOCGSTAMP:
4138
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
4139
- case SIOCGSTAMPNS:
4140
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
4141
-
41424191 #ifdef CONFIG_INET
41434192 case SIOCADDRT:
41444193 case SIOCDELRT:
....@@ -4176,8 +4225,7 @@
41764225 TP_STATUS_KERNEL))
41774226 mask |= EPOLLIN | EPOLLRDNORM;
41784227 }
4179
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4180
- po->pressure = 0;
4228
+ packet_rcv_try_clear_pressure(po);
41814229 spin_unlock_bh(&sk->sk_receive_queue.lock);
41824230 spin_lock_bh(&sk->sk_write_queue.lock);
41834231 if (po->tx_ring.pg_vec) {
....@@ -4296,7 +4344,7 @@
42964344 struct packet_ring_buffer *rb;
42974345 struct sk_buff_head *rb_queue;
42984346 __be16 num;
4299
- int err = -EINVAL;
4347
+ int err;
43004348 /* Added to avoid minimal code churn */
43014349 struct tpacket_req *req = &req_u->req;
43024350
....@@ -4526,10 +4574,9 @@
45264574 .getname = packet_getname_spkt,
45274575 .poll = datagram_poll,
45284576 .ioctl = packet_ioctl,
4577
+ .gettstamp = sock_gettstamp,
45294578 .listen = sock_no_listen,
45304579 .shutdown = sock_no_shutdown,
4531
- .setsockopt = sock_no_setsockopt,
4532
- .getsockopt = sock_no_getsockopt,
45334580 .sendmsg = packet_sendmsg_spkt,
45344581 .recvmsg = packet_recvmsg,
45354582 .mmap = sock_no_mmap,
....@@ -4547,13 +4594,11 @@
45474594 .getname = packet_getname,
45484595 .poll = packet_poll,
45494596 .ioctl = packet_ioctl,
4597
+ .gettstamp = sock_gettstamp,
45504598 .listen = sock_no_listen,
45514599 .shutdown = sock_no_shutdown,
45524600 .setsockopt = packet_setsockopt,
45534601 .getsockopt = packet_getsockopt,
4554
-#ifdef CONFIG_COMPAT
4555
- .compat_setsockopt = compat_packet_setsockopt,
4556
-#endif
45574602 .sendmsg = packet_sendmsg,
45584603 .recvmsg = packet_recvmsg,
45594604 .mmap = packet_mmap,
....@@ -4630,9 +4675,11 @@
46304675 mutex_init(&net->packet.sklist_lock);
46314676 INIT_HLIST_HEAD(&net->packet.sklist);
46324677
4678
+#ifdef CONFIG_PROC_FS
46334679 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
46344680 sizeof(struct seq_net_private)))
46354681 return -ENOMEM;
4682
+#endif /* CONFIG_PROC_FS */
46364683
46374684 return 0;
46384685 }