From a36159eec6ca17402b0e146b86efaf76568dc353 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 20 Sep 2024 01:41:23 +0000 Subject: [PATCH] 重命名 AX88772C_eeprom/asix.c 为 asix_mac.c --- kernel/net/packet/af_packet.c | 562 ++++++++++++++++++++++++++++++------------------------- 1 files changed, 304 insertions(+), 258 deletions(-) diff --git a/kernel/net/packet/af_packet.c b/kernel/net/packet/af_packet.c index 51bca56..bbdb32a 100644 --- a/kernel/net/packet/af_packet.c +++ b/kernel/net/packet/af_packet.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -43,13 +44,6 @@ * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/types.h> @@ -63,7 +57,6 @@ #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> -#include <linux/delay.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -100,52 +93,56 @@ /* Assumptions: - - if device has no dev->hard_header routine, it adds and removes ll header - inside itself. In this case ll header is invisible outside of device, - but higher levels still should reserve dev->hard_header_len. - Some devices are enough clever to reallocate skb, when header - will not fit to reserved space (tunnel), another ones are silly - (PPP). + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. + The device may prepend its own header internally. In this case, its + needed_headroom should be set to the space needed for it to add its + internal header. + For example, a WiFi driver pretending to be an Ethernet driver should + set its hard_header_len to be the Ethernet header length, and set its + needed_headroom to be (the real WiFi header length - the fake Ethernet + header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- -Incoming, dev->hard_header!=NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->hard_header!=NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->hard_header==NULL - mac_header -> UNKNOWN position. It is very likely, that it points to ll - header. PPP makes it, that is wrong, because introduce - assymetry between rx and tx paths. +Incoming, dev_has_header(dev) == false + mac_header -> data + However drivers often make it point to the ll header. + This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->hard_header==NULL - mac_header -> data. ll header is still not built! +Outgoing, dev_has_header(dev) == false + mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->hard_header==NULL we are unlikely to restore sensible ll header. + If dev_has_header(dev) == false we are unable to restore the ll header, + because it is invisible to us. On transmit: ------------ -dev->hard_header != NULL +dev->header_ops != NULL mac_header -> ll header data -> ll header -dev->hard_header == NULL (ll header is added by device, we cannot control it) +dev->header_ops == NULL (ll header is invisible to us) mac_header -> data data -> data - We should set nh.raw on output to correct posistion, + We should set network_header on output to the correct position, packet classifier depends on it. */ @@ -184,7 +181,6 @@ #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) -#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, @@ -273,27 +269,26 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) { - return po->xmit == packet_direct_xmit; -} - -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + return READ_ONCE(po->xmit) == packet_direct_xmit; } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; + int cpu = raw_smp_processor_id(); u16 queue_index; +#ifdef CONFIG_XPS + skb->sender_cpu = cpu + 1; +#endif + skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb, NULL, - __packet_pick_tx_queue); + queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = __packet_pick_tx_queue(dev, skb, NULL); + queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; @@ -371,18 +366,20 @@ { union tpacket_uhdr h; + /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; + WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - h.h2->tp_status = status; + WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: - h.h3->tp_status = status; + WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: @@ -393,23 +390,25 @@ smp_wmb(); } -static int __packet_get_status(struct packet_sock *po, void *frame) +static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); + /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); - return h.h1->tp_status; + return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); - return h.h2->tp_status; + return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); - return h.h3->tp_status; + return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -417,17 +416,18 @@ } } -static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, +static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; - if (ktime_to_timespec_cond(skb->tstamp, ts)) + if ((flags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec64_cond(skb->tstamp, ts)) return TP_STATUS_TS_SOFTWARE; return 0; @@ -437,13 +437,20 @@ struct sk_buff *skb) { union tpacket_uhdr h; - struct timespec ts; + struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) return 0; h.raw = frame; + /* + * versions 1 through 3 overflow the timestamps in y2106, since they + * all store the seconds in a 32-bit unsigned integer. + * If we create a version 4, that should have a 64-bit timestamp, + * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit + * nanoseconds. + */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; @@ -469,10 +476,10 @@ return ts_status; } -static void *packet_lookup_frame(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int position, - int status) +static void *packet_lookup_frame(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int position, + int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; @@ -529,7 +536,7 @@ int blk_size_in_bytes) { struct net_device *dev; - unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; + unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; @@ -541,31 +548,25 @@ } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); - if (!err) { - /* - * If the link speed is so slow you don't really - * need to worry about perf anyways - */ - if (ecmd.base.speed < SPEED_1000 || - ecmd.base.speed == SPEED_UNKNOWN) { - return DEFAULT_PRB_RETIRE_TOV; - } else { - msec = 1; - div = ecmd.base.speed / 1000; - } - } else + if (err) return DEFAULT_PRB_RETIRE_TOV; + /* If the link speed is so slow you don't really + * need to worry about perf anyways + */ + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) + return DEFAULT_PRB_RETIRE_TOV; + + div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; - tmo = mbits * msec; - if (div) - return tmo+1; - return tmo; + return mbits + 1; + return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, @@ -601,6 +602,7 @@ req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); @@ -667,10 +669,9 @@ * */ if (BLOCK_NUM_PKTS(pbd)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_chill(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { @@ -768,7 +769,7 @@ struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; - if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -784,8 +785,8 @@ * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ - struct timespec ts; - getnstimeofday(&ts); + struct timespec64 ts; + ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } @@ -815,7 +816,7 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { - struct timespec ts; + struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); @@ -828,7 +829,7 @@ BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - getnstimeofday(&ts); + ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; @@ -929,10 +930,9 @@ * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_chill(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; @@ -953,7 +953,8 @@ __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); - atomic_dec(&pkc->blk_fill_in_prog); + + read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, @@ -1008,14 +1009,13 @@ pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; - atomic_inc(&pkc->blk_fill_in_prog); + read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, - int status, unsigned int len ) { @@ -1087,7 +1087,7 @@ po->rx_ring.head, status); return curr; case TPACKET_V3: - return __packet_lookup_frame_in_block(po, skb, status, len); + return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); @@ -1095,10 +1095,10 @@ } } -static void *prb_lookup_block(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int idx, - int status) +static void *prb_lookup_block(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int idx, + int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); @@ -1211,12 +1211,12 @@ #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 -static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.frame_max + 1; - idx = po->rx_ring.head; + len = READ_ONCE(po->rx_ring.frame_max) + 1; + idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1224,12 +1224,12 @@ return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.prb_bdqc.knum_blocks; - idx = po->rx_ring.prb_bdqc.kactive_blk_num; + len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); + idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1237,15 +1237,18 @@ return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(const struct packet_sock *po, + const struct sk_buff *skb) { - struct sock *sk = &po->sk; + const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { - int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - (skb ? skb->truesize : 0); - if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + + if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; @@ -1270,17 +1273,22 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int ret; - bool has_room; + int pressure, ret; - spin_lock_bh(&po->sk.sk_receive_queue.lock); ret = __packet_rcv_has_room(po, skb); - has_room = ret == ROOM_NORMAL; - if (po->pressure == has_room) - po->pressure = !has_room; - spin_unlock_bh(&po->sk.sk_receive_queue.lock); + pressure = ret != ROOM_NORMAL; + + if (READ_ONCE(po->pressure) != pressure) + WRITE_ONCE(po->pressure, pressure); return ret; +} + +static void packet_rcv_try_clear_pressure(struct packet_sock *po) +{ + if (READ_ONCE(po->pressure) && + __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); } static void packet_sock_destruct(struct sock *sk) @@ -1356,7 +1364,7 @@ struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; - po = pkt_sk(f->arr[idx]); + po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); @@ -1368,8 +1376,8 @@ i = j = min_t(int, po->rollover->sock, num - 1); do { - po_next = pkt_sk(f->arr[i]); - if (po_next != po_skip && !po_next->pressure && + po_next = pkt_sk(rcu_dereference(f->arr[i])); + if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -1463,7 +1471,7 @@ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); - po = pkt_sk(f->arr[idx]); + po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1477,7 +1485,7 @@ struct packet_fanout *f = po->fanout; spin_lock(&f->lock); - f->arr[f->num_members] = sk; + rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) @@ -1492,11 +1500,14 @@ spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { - if (f->arr[i] == sk) + if (rcu_dereference_protected(f->arr[i], + lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); - f->arr[i] = f->arr[f->num_members - 1]; + rcu_assign_pointer(f->arr[i], + rcu_dereference_protected(f->arr[f->num_members - 1], + lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); @@ -1539,7 +1550,7 @@ } } -static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1548,10 +1559,10 @@ if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; - if (len != sizeof(fprog)) - return -EINVAL; - if (copy_from_user(&fprog, data, len)) - return -EFAULT; + + ret = copy_bpf_fprog_from_user(&fprog, data, len); + if (ret) + return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) @@ -1561,7 +1572,7 @@ return 0; } -static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1571,7 +1582,7 @@ return -EPERM; if (len != sizeof(fd)) return -EINVAL; - if (copy_from_user(&fd, data, len)) + if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); @@ -1582,7 +1593,7 @@ return 0; } -static int fanout_set_data(struct packet_sock *po, char __user *data, +static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { @@ -1634,13 +1645,15 @@ return false; } -static int fanout_add(struct sock *sk, u16 id, u16 type_flags) +static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); + u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; + u16 id = args->id; int err; switch (type) { @@ -1698,11 +1711,21 @@ } } err = -EINVAL; - if (match && match->flags != flags) - goto out; - if (!match) { + if (match) { + if (match->flags != flags) + goto out; + if (args->max_num_members && + args->max_num_members != match->max_num_members) + goto out; + } else { + if (args->max_num_members > PACKET_FANOUT_MAX) + goto out; + if (!args->max_num_members) + /* legacy PACKET_FANOUT_MAX */ + args->max_num_members = 256; err = -ENOMEM; - match = kzalloc(sizeof(*match), GFP_KERNEL); + match = kvzalloc(struct_size(match, arr, args->max_num_members), + GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); @@ -1719,6 +1742,7 @@ match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; + match->max_num_members = args->max_num_members; list_add(&match->list, &fanout_list); } err = -EINVAL; @@ -1729,7 +1753,7 @@ match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; - if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { + if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ @@ -1746,7 +1770,7 @@ if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); - kfree(match); + kvfree(match); } out: @@ -1836,7 +1860,7 @@ skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -1864,6 +1888,24 @@ return 0; } +static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) +{ + int depth; + + if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && + sock->type == SOCK_RAW) { + skb_reset_mac_header(skb); + skb->protocol = dev_parse_header_protocol(skb); + } + + /* Move network header to the right position for VLAN tagged packets */ + if (likely(skb->dev->type == ARPHRD_ETHER) && + eth_type_vlan(skb->protocol) && + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); + + skb_probe_transport_header(skb); +} /* * Output a raw packet to a device layer. This bypasses all the other @@ -1956,7 +1998,7 @@ goto retry; } - if (!dev_validate_header(dev, skb->data, len)) { + if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } @@ -1979,12 +2021,12 @@ skb->mark = sk->sk_mark; skb->tstamp = sockc.transmit_time; - sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; - skb_probe_transport_header(skb, 0); + packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); @@ -2061,7 +2103,7 @@ skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * @@ -2106,7 +2148,7 @@ sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev)) + if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; @@ -2126,7 +2168,7 @@ skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; @@ -2138,10 +2180,8 @@ drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2170,7 +2210,7 @@ unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; - struct timespec ts; + struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; unsigned int slot_id = 0; @@ -2192,7 +2232,7 @@ if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { @@ -2207,11 +2247,16 @@ if (!res) goto drop_n_restore; + /* If we are flooded, just give up */ + if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { + atomic_inc(&po->tp_drops); + goto drop_n_restore; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (snaplen > res) @@ -2232,9 +2277,7 @@ macoff = netoff - maclen; } if (netoff > USHRT_MAX) { - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; - spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { @@ -2247,8 +2290,11 @@ copy_skb = skb_get(skb); skb_head = skb->data; } - if (copy_skb) + if (copy_skb) { + memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, + sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); + } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { @@ -2300,7 +2346,7 @@ * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } @@ -2313,8 +2359,13 @@ skb_copy_bits(skb, 0, h.raw + macoff, snaplen); - if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) - getnstimeofday(&ts); + /* Always timestamp; prefer an existing software timestamp taken + * closer to the time of capture. + */ + ts_status = tpacket_get_timestamp(skb, &ts, + po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE); + if (!ts_status) + ktime_get_real_ts64(&ts); status |= ts_status; @@ -2370,7 +2421,7 @@ sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev)) + if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; @@ -2413,9 +2464,9 @@ return 0; drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -2441,15 +2492,6 @@ } sock_wfree(skb); -} - -static void tpacket_set_protocol(const struct net_device *dev, - struct sk_buff *skb) -{ - if (dev->type == ARPHRD_ETHER) { - skb_reset_mac_header(skb); - skb->protocol = eth_hdr(skb)->h_proto; - } } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) @@ -2499,7 +2541,7 @@ skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; skb->tstamp = sockc->transmit_time; - sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); @@ -2522,8 +2564,6 @@ return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; - if (!skb->protocol) - tpacket_set_protocol(dev, skb); data += hdrlen; to_write -= hdrlen; @@ -2558,7 +2598,7 @@ len = ((to_write > len_max) ? len_max : to_write); } - skb_probe_transport_header(skb, 0); + packet_parse_headers(skb, sock); return tp_len; } @@ -2788,9 +2828,11 @@ packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; - err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + err = READ_ONCE(po->xmit)(skb); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -2957,13 +2999,13 @@ if (err) goto out_free; - if (sock->type == SOCK_RAW && - !dev_validate_header(dev, skb->data, len)) { + if ((sock->type == SOCK_RAW && + !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } - sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc.tsflags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { @@ -2977,6 +3019,11 @@ skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; + if (unlikely(extra_len == 4)) + skb->no_fcs = 1; + + packet_parse_headers(skb, sock); + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) @@ -2985,14 +3032,14 @@ virtio_net_hdr_set_proto(skb, &vnet_hdr); } - skb_probe_transport_header(skb, reserve); - - if (unlikely(extra_len == 4)) - skb->no_fcs = 1; - - err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + err = READ_ONCE(po->xmit)(skb); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); @@ -3012,10 +3059,13 @@ struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - if (po->tx_ring.pg_vec) + /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. + * tpacket_snd() will redo the check safely. + */ + if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); - else - return packet_snd(sock, msg, len); + + return packet_snd(sock, msg, len); } /* @@ -3076,7 +3126,7 @@ kfree(po->rollover); if (f) { fanout_release_data(f); - kfree(f); + kvfree(f); } /* * Now the socket is dead. No more input will appear. @@ -3111,6 +3161,9 @@ lock_sock(sk); spin_lock(&po->bind_lock); + if (!proto) + proto = po->num; + rcu_read_lock(); if (po->fanout) { @@ -3213,7 +3266,7 @@ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); name[sizeof(uaddr->sa_data)] = 0; - return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); + return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -3230,8 +3283,7 @@ if (sll->sll_family != AF_PACKET) return -EINVAL; - return packet_do_bind(sk, NULL, sll->sll_ifindex, - sll->sll_protocol ? : pkt_sk(sk)->num); + return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { @@ -3371,8 +3423,7 @@ if (skb == NULL) goto out; - if (pkt_sk(sk)->pressure) - packet_rcv_has_room(pkt_sk(sk), NULL); + packet_rcv_try_clear_pressure(pkt_sk(sk)); if (pkt_sk(sk)->has_vnet_hdr) { err = packet_rcv_vnet(msg, skb, &len); @@ -3407,6 +3458,8 @@ sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + const size_t max_len = min(sizeof(skb->cb), + sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled @@ -3429,18 +3482,21 @@ msg->msg_namelen = sizeof(struct sockaddr_ll); } } + if (WARN_ON_ONCE(copy_len > max_len)) { + copy_len = max_len; + msg->msg_namelen = copy_len; + } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } - if (pkt_sk(sk)->auxdata) { + if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; aux.tp_len = origlen; @@ -3670,7 +3726,8 @@ } static int -packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, + unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -3690,7 +3747,7 @@ return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); - if (copy_from_user(&mreq, optval, len)) + if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; @@ -3721,7 +3778,7 @@ if (optlen < len) { ret = -EINVAL; } else { - if (copy_from_user(&req_u.req, optval, len)) + if (copy_from_sockptr(&req_u.req, optval, len)) ret = -EFAULT; else ret = packet_set_ring(sk, &req_u, 0, @@ -3736,7 +3793,7 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; @@ -3748,7 +3805,7 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: @@ -3774,7 +3831,7 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; @@ -3794,7 +3851,7 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3813,12 +3870,10 @@ if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - po->auxdata = !!val; - release_sock(sk); + packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: @@ -3827,12 +3882,10 @@ if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - po->origdev = !!val; - release_sock(sk); + packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: @@ -3843,7 +3896,7 @@ return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3862,7 +3915,7 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->tp_tstamp = val; @@ -3870,14 +3923,14 @@ } case PACKET_FANOUT: { - int val; + struct fanout_args args = { 0 }; - if (optlen != sizeof(val)) + if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; - return fanout_add(sk, val & 0xffff, val >> 16); + return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { @@ -3887,13 +3940,27 @@ return fanout_set_data(po, optval, optlen); } + case PACKET_IGNORE_OUTGOING: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + if (val < 0 || val > 1) + return -EINVAL; + + po->prot_hook.ignore_outgoing = !!val; + return 0; + } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3912,10 +3979,11 @@ if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - po->xmit = val ? packet_direct_xmit : dev_queue_xmit; + /* Paired with all lockless reads of po->xmit */ + WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit); return 0; } default: @@ -3933,6 +4001,7 @@ void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3949,23 +4018,26 @@ memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: - val = po->auxdata; + val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: - val = po->origdev; + val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = po->has_vnet_hdr; @@ -4010,6 +4082,9 @@ ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_IGNORE_OUTGOING: + val = po->prot_hook.ignore_outgoing; + break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; @@ -4038,28 +4113,6 @@ return 0; } - -#ifdef CONFIG_COMPAT -static int compat_packet_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct packet_sock *po = pkt_sk(sock->sk); - - if (level != SOL_PACKET) - return -ENOPROTOOPT; - - if (optname == PACKET_FANOUT_DATA && - po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) { - optval = (char __user *)get_compat_bpf_fprog(optval); - if (!optval) - return -EFAULT; - optlen = sizeof(struct sock_fprog); - } - - return packet_setsockopt(sock, level, optname, optval, optlen); -} -#endif - static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { @@ -4075,7 +4128,7 @@ case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist); - /* fallthrough */ + fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { @@ -4135,11 +4188,6 @@ spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: @@ -4177,8 +4225,7 @@ TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - po->pressure = 0; + packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { @@ -4297,7 +4344,7 @@ struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; - int err = -EINVAL; + int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; @@ -4527,10 +4574,9 @@ .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, @@ -4548,13 +4594,11 @@ .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_packet_setsockopt, -#endif .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, @@ -4631,9 +4675,11 @@ mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); +#ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; +#endif /* CONFIG_PROC_FS */ return 0; } -- Gitblit v1.6.2