.. | .. |
---|
22 | 22 | #include <linux/net.h> |
---|
23 | 23 | #include <linux/netdevice.h> |
---|
24 | 24 | #include <linux/rculist.h> |
---|
25 | | -#include <net/xdp_sock.h> |
---|
| 25 | +#include <net/xdp_sock_drv.h> |
---|
26 | 26 | #include <net/xdp.h> |
---|
27 | 27 | |
---|
28 | 28 | #include "xsk_queue.h" |
---|
29 | 29 | #include "xdp_umem.h" |
---|
| 30 | +#include "xsk.h" |
---|
30 | 31 | |
---|
31 | 32 | #define TX_BATCH_SIZE 16 |
---|
32 | 33 | |
---|
33 | | -static struct xdp_sock *xdp_sk(struct sock *sk) |
---|
| 34 | +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); |
---|
| 35 | + |
---|
| 36 | +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) |
---|
34 | 37 | { |
---|
35 | | - return (struct xdp_sock *)sk; |
---|
| 38 | + if (pool->cached_need_wakeup & XDP_WAKEUP_RX) |
---|
| 39 | + return; |
---|
| 40 | + |
---|
| 41 | + pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; |
---|
| 42 | + pool->cached_need_wakeup |= XDP_WAKEUP_RX; |
---|
36 | 43 | } |
---|
| 44 | +EXPORT_SYMBOL(xsk_set_rx_need_wakeup); |
---|
37 | 45 | |
---|
38 | | -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) |
---|
| 46 | +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) |
---|
39 | 47 | { |
---|
40 | | - return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && |
---|
41 | | - READ_ONCE(xs->umem->fq); |
---|
42 | | -} |
---|
| 48 | + struct xdp_sock *xs; |
---|
43 | 49 | |
---|
44 | | -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) |
---|
45 | | -{ |
---|
46 | | - return xskq_peek_addr(umem->fq, addr); |
---|
47 | | -} |
---|
48 | | -EXPORT_SYMBOL(xsk_umem_peek_addr); |
---|
| 50 | + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) |
---|
| 51 | + return; |
---|
49 | 52 | |
---|
50 | | -void xsk_umem_discard_addr(struct xdp_umem *umem) |
---|
51 | | -{ |
---|
52 | | - xskq_discard_addr(umem->fq); |
---|
53 | | -} |
---|
54 | | -EXPORT_SYMBOL(xsk_umem_discard_addr); |
---|
55 | | - |
---|
56 | | -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) |
---|
57 | | -{ |
---|
58 | | - void *buffer; |
---|
59 | | - u64 addr; |
---|
60 | | - int err; |
---|
61 | | - |
---|
62 | | - if (!xskq_peek_addr(xs->umem->fq, &addr) || |
---|
63 | | - len > xs->umem->chunk_size_nohr) { |
---|
64 | | - xs->rx_dropped++; |
---|
65 | | - return -ENOSPC; |
---|
| 53 | + rcu_read_lock(); |
---|
| 54 | + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { |
---|
| 55 | + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; |
---|
66 | 56 | } |
---|
| 57 | + rcu_read_unlock(); |
---|
67 | 58 | |
---|
68 | | - addr += xs->umem->headroom; |
---|
| 59 | + pool->cached_need_wakeup |= XDP_WAKEUP_TX; |
---|
| 60 | +} |
---|
| 61 | +EXPORT_SYMBOL(xsk_set_tx_need_wakeup); |
---|
69 | 62 | |
---|
70 | | - buffer = xdp_umem_get_data(xs->umem, addr); |
---|
71 | | - memcpy(buffer, xdp->data, len); |
---|
72 | | - err = xskq_produce_batch_desc(xs->rx, addr, len); |
---|
73 | | - if (!err) { |
---|
74 | | - xskq_discard_addr(xs->umem->fq); |
---|
75 | | - xdp_return_buff(xdp); |
---|
76 | | - return 0; |
---|
| 63 | +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) |
---|
| 64 | +{ |
---|
| 65 | + if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) |
---|
| 66 | + return; |
---|
| 67 | + |
---|
| 68 | + pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; |
---|
| 69 | + pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; |
---|
| 70 | +} |
---|
| 71 | +EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); |
---|
| 72 | + |
---|
| 73 | +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) |
---|
| 74 | +{ |
---|
| 75 | + struct xdp_sock *xs; |
---|
| 76 | + |
---|
| 77 | + if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) |
---|
| 78 | + return; |
---|
| 79 | + |
---|
| 80 | + rcu_read_lock(); |
---|
| 81 | + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { |
---|
| 82 | + xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; |
---|
77 | 83 | } |
---|
| 84 | + rcu_read_unlock(); |
---|
78 | 85 | |
---|
79 | | - xs->rx_dropped++; |
---|
80 | | - return err; |
---|
| 86 | + pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; |
---|
| 87 | +} |
---|
| 88 | +EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); |
---|
| 89 | + |
---|
| 90 | +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) |
---|
| 91 | +{ |
---|
| 92 | + return pool->uses_need_wakeup; |
---|
| 93 | +} |
---|
| 94 | +EXPORT_SYMBOL(xsk_uses_need_wakeup); |
---|
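Reviewer note: the need_wakeup helpers above are meant to be called from zero-copy drivers so that user space only issues a syscall when the driver actually needs one. A minimal sketch of how a driver's Rx clean path might drive them is below; all `mydrv_*` names and structures are hypothetical, only the `xsk_*` helpers come from this patch.

```c
/* Illustrative sketch only: all mydrv_* names are hypothetical. */
static int mydrv_clean_rx_zc(struct mydrv_rx_ring *rxq, int budget)
{
	struct xsk_buff_pool *pool = rxq->xsk_pool;
	bool failure = false;
	int done = 0;

	/* ... receive up to 'budget' frames, refilling HW descriptors
	 * from the pool; set 'failure' if the fill ring ran dry ...
	 */

	if (xsk_uses_need_wakeup(pool)) {
		if (failure)
			xsk_set_rx_need_wakeup(pool);	/* user space must refill and kick */
		else
			xsk_clear_rx_need_wakeup(pool);	/* no syscall needed */
	}

	return done;
}
```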
| 95 | + |
---|
| 96 | +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, |
---|
| 97 | + u16 queue_id) |
---|
| 98 | +{ |
---|
| 99 | + if (queue_id < dev->real_num_rx_queues) |
---|
| 100 | + return dev->_rx[queue_id].pool; |
---|
| 101 | + if (queue_id < dev->real_num_tx_queues) |
---|
| 102 | + return dev->_tx[queue_id].pool; |
---|
| 103 | + |
---|
| 104 | + return NULL; |
---|
| 105 | +} |
---|
| 106 | +EXPORT_SYMBOL(xsk_get_pool_from_qid); |
---|
| 107 | + |
---|
| 108 | +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) |
---|
| 109 | +{ |
---|
| 110 | + if (queue_id < dev->num_rx_queues) |
---|
| 111 | + dev->_rx[queue_id].pool = NULL; |
---|
| 112 | + if (queue_id < dev->num_tx_queues) |
---|
| 113 | + dev->_tx[queue_id].pool = NULL; |
---|
| 114 | +} |
---|
| 115 | + |
---|
| 116 | +/* The buffer pool is stored both in the _rx struct and the _tx struct as we do |
---|
| 117 | + * not know if the device has more tx queues than rx, or the opposite. |
---|
| 118 | + * This might also change during run time. |
---|
| 119 | + */ |
---|
| 120 | +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, |
---|
| 121 | + u16 queue_id) |
---|
| 122 | +{ |
---|
| 123 | + if (queue_id >= max_t(unsigned int, |
---|
| 124 | + dev->real_num_rx_queues, |
---|
| 125 | + dev->real_num_tx_queues)) |
---|
| 126 | + return -EINVAL; |
---|
| 127 | + |
---|
| 128 | + if (queue_id < dev->real_num_rx_queues) |
---|
| 129 | + dev->_rx[queue_id].pool = pool; |
---|
| 130 | + if (queue_id < dev->real_num_tx_queues) |
---|
| 131 | + dev->_tx[queue_id].pool = pool; |
---|
| 132 | + |
---|
| 133 | + return 0; |
---|
| 134 | +} |
---|
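Reviewer note: xsk_reg_pool_at_qid()/xsk_get_pool_from_qid() register the pool on both _rx and _tx precisely because either queue count may be the larger one. A hedged sketch of a driver-side lookup when (re)configuring a queue; `mydrv_enable_zc()` is hypothetical, the lookup helper is the one added above.

```c
/* Sketch: driver-side lookup when enabling zero-copy on a queue. */
static int mydrv_xsk_setup_queue(struct net_device *dev, u16 qid)
{
	struct xsk_buff_pool *pool = xsk_get_pool_from_qid(dev, qid);

	if (!pool)
		return 0;	/* no AF_XDP socket bound to this queue */

	return mydrv_enable_zc(dev, qid, pool);	/* hypothetical */
}
```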
| 135 | + |
---|
| 136 | +void xp_release(struct xdp_buff_xsk *xskb) |
---|
| 137 | +{ |
---|
| 138 | + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; |
---|
| 139 | +} |
---|
| 140 | + |
---|
| 141 | +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) |
---|
| 142 | +{ |
---|
| 143 | + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; |
---|
| 144 | + |
---|
| 145 | + offset += xskb->pool->headroom; |
---|
| 146 | + if (!xskb->pool->unaligned) |
---|
| 147 | + return xskb->orig_addr + offset; |
---|
| 148 | + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); |
---|
81 | 149 | } |
---|
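Reviewer note: xp_get_handle() folds the headroom-adjusted offset into the address posted to the Rx ring; in unaligned mode the offset lands in the upper bits (XSK_UNALIGNED_BUF_OFFSET_SHIFT is 48 in the UAPI). For reference, this is how user space recovers the real umem position from such an address, mirroring what helpers like libbpf's xsk_umem__add_offset_to_addr() do; a minimal sketch:

```c
#include <linux/if_xdp.h>
#include <stdint.h>

/* Sketch: recover the byte position of the frame data inside the umem
 * from an unaligned-mode descriptor address produced by xp_get_handle().
 */
static inline uint64_t xsk_unaligned_umem_offset(uint64_t addr)
{
	uint64_t base = addr & XSK_UNALIGNED_BUF_ADDR_MASK;
	uint64_t off  = addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;

	return base + off;
}
```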
82 | 150 | |
---|
83 | 151 | static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) |
---|
84 | 152 | { |
---|
85 | | - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); |
---|
| 153 | + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); |
---|
| 154 | + u64 addr; |
---|
| 155 | + int err; |
---|
86 | 156 | |
---|
87 | | - if (err) |
---|
88 | | - xs->rx_dropped++; |
---|
| 157 | + addr = xp_get_handle(xskb); |
---|
| 158 | + err = xskq_prod_reserve_desc(xs->rx, addr, len); |
---|
| 159 | + if (err) { |
---|
| 160 | + xs->rx_queue_full++; |
---|
| 161 | + return err; |
---|
| 162 | + } |
---|
89 | 163 | |
---|
90 | | - return err; |
---|
| 164 | + xp_release(xskb); |
---|
| 165 | + return 0; |
---|
91 | 166 | } |
---|
92 | 167 | |
---|
93 | | -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) |
---|
| 168 | +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) |
---|
| 169 | +{ |
---|
| 170 | + void *from_buf, *to_buf; |
---|
| 171 | + u32 metalen; |
---|
| 172 | + |
---|
| 173 | + if (unlikely(xdp_data_meta_unsupported(from))) { |
---|
| 174 | + from_buf = from->data; |
---|
| 175 | + to_buf = to->data; |
---|
| 176 | + metalen = 0; |
---|
| 177 | + } else { |
---|
| 178 | + from_buf = from->data_meta; |
---|
| 179 | + metalen = from->data - from->data_meta; |
---|
| 180 | + to_buf = to->data - metalen; |
---|
| 181 | + } |
---|
| 182 | + |
---|
| 183 | + memcpy(to_buf, from_buf, len + metalen); |
---|
| 184 | +} |
---|
| 185 | + |
---|
| 186 | +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, |
---|
| 187 | + bool explicit_free) |
---|
| 188 | +{ |
---|
| 189 | + struct xdp_buff *xsk_xdp; |
---|
| 190 | + int err; |
---|
| 191 | + |
---|
| 192 | + if (len > xsk_pool_get_rx_frame_size(xs->pool)) { |
---|
| 193 | + xs->rx_dropped++; |
---|
| 194 | + return -ENOSPC; |
---|
| 195 | + } |
---|
| 196 | + |
---|
| 197 | + xsk_xdp = xsk_buff_alloc(xs->pool); |
---|
| 198 | + if (!xsk_xdp) { |
---|
| 199 | + xs->rx_dropped++; |
---|
| 200 | + return -ENOSPC; |
---|
| 201 | + } |
---|
| 202 | + |
---|
| 203 | + xsk_copy_xdp(xsk_xdp, xdp, len); |
---|
| 204 | + err = __xsk_rcv_zc(xs, xsk_xdp, len); |
---|
| 205 | + if (err) { |
---|
| 206 | + xsk_buff_free(xsk_xdp); |
---|
| 207 | + return err; |
---|
| 208 | + } |
---|
| 209 | + if (explicit_free) |
---|
| 210 | + xdp_return_buff(xdp); |
---|
| 211 | + return 0; |
---|
| 212 | +} |
---|
| 213 | + |
---|
| 214 | +static bool xsk_tx_writeable(struct xdp_sock *xs) |
---|
| 215 | +{ |
---|
| 216 | + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) |
---|
| 217 | + return false; |
---|
| 218 | + |
---|
| 219 | + return true; |
---|
| 220 | +} |
---|
| 221 | + |
---|
| 222 | +static bool xsk_is_bound(struct xdp_sock *xs) |
---|
| 223 | +{ |
---|
| 224 | + if (READ_ONCE(xs->state) == XSK_BOUND) { |
---|
| 225 | + /* Matches smp_wmb() in bind(). */ |
---|
| 226 | + smp_rmb(); |
---|
| 227 | + return true; |
---|
| 228 | + } |
---|
| 229 | + return false; |
---|
| 230 | +} |
---|
| 231 | + |
---|
| 232 | +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, |
---|
| 233 | + bool explicit_free) |
---|
94 | 234 | { |
---|
95 | 235 | u32 len; |
---|
| 236 | + |
---|
| 237 | + if (!xsk_is_bound(xs)) |
---|
| 238 | + return -EINVAL; |
---|
96 | 239 | |
---|
97 | 240 | if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) |
---|
98 | 241 | return -EINVAL; |
---|
99 | 242 | |
---|
100 | 243 | len = xdp->data_end - xdp->data; |
---|
101 | 244 | |
---|
102 | | - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? |
---|
103 | | - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); |
---|
| 245 | + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? |
---|
| 246 | + __xsk_rcv_zc(xs, xdp, len) : |
---|
| 247 | + __xsk_rcv(xs, xdp, len, explicit_free); |
---|
104 | 248 | } |
---|
105 | 249 | |
---|
106 | | -void xsk_flush(struct xdp_sock *xs) |
---|
| 250 | +static void xsk_flush(struct xdp_sock *xs) |
---|
107 | 251 | { |
---|
108 | | - xskq_produce_flush_desc(xs->rx); |
---|
109 | | - xs->sk.sk_data_ready(&xs->sk); |
---|
| 252 | + xskq_prod_submit(xs->rx); |
---|
| 253 | + __xskq_cons_release(xs->pool->fq); |
---|
| 254 | + sock_def_readable(&xs->sk); |
---|
110 | 255 | } |
---|
111 | 256 | |
---|
112 | 257 | int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) |
---|
113 | 258 | { |
---|
114 | | - u32 len = xdp->data_end - xdp->data; |
---|
115 | | - void *buffer; |
---|
116 | | - u64 addr; |
---|
117 | 259 | int err; |
---|
118 | 260 | |
---|
119 | | - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) |
---|
120 | | - return -EINVAL; |
---|
121 | | - |
---|
122 | | - if (!xskq_peek_addr(xs->umem->fq, &addr) || |
---|
123 | | - len > xs->umem->chunk_size_nohr) { |
---|
124 | | - xs->rx_dropped++; |
---|
125 | | - return -ENOSPC; |
---|
126 | | - } |
---|
127 | | - |
---|
128 | | - addr += xs->umem->headroom; |
---|
129 | | - |
---|
130 | | - buffer = xdp_umem_get_data(xs->umem, addr); |
---|
131 | | - memcpy(buffer, xdp->data, len); |
---|
132 | | - err = xskq_produce_batch_desc(xs->rx, addr, len); |
---|
133 | | - if (!err) { |
---|
134 | | - xskq_discard_addr(xs->umem->fq); |
---|
135 | | - xsk_flush(xs); |
---|
136 | | - return 0; |
---|
137 | | - } |
---|
138 | | - |
---|
139 | | - xs->rx_dropped++; |
---|
| 261 | + spin_lock_bh(&xs->rx_lock); |
---|
| 262 | + err = xsk_rcv(xs, xdp, false); |
---|
| 263 | + xsk_flush(xs); |
---|
| 264 | + spin_unlock_bh(&xs->rx_lock); |
---|
140 | 265 | return err; |
---|
141 | 266 | } |
---|
142 | 267 | |
---|
143 | | -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) |
---|
| 268 | +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) |
---|
144 | 269 | { |
---|
145 | | - xskq_produce_flush_addr_n(umem->cq, nb_entries); |
---|
146 | | -} |
---|
147 | | -EXPORT_SYMBOL(xsk_umem_complete_tx); |
---|
| 270 | + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); |
---|
| 271 | + int err; |
---|
148 | 272 | |
---|
149 | | -void xsk_umem_consume_tx_done(struct xdp_umem *umem) |
---|
| 273 | + err = xsk_rcv(xs, xdp, true); |
---|
| 274 | + if (err) |
---|
| 275 | + return err; |
---|
| 276 | + |
---|
| 277 | + if (!xs->flush_node.prev) |
---|
| 278 | + list_add(&xs->flush_node, flush_list); |
---|
| 279 | + |
---|
| 280 | + return 0; |
---|
| 281 | +} |
---|
| 282 | + |
---|
| 283 | +void __xsk_map_flush(void) |
---|
| 284 | +{ |
---|
| 285 | + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); |
---|
| 286 | + struct xdp_sock *xs, *tmp; |
---|
| 287 | + |
---|
| 288 | + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { |
---|
| 289 | + xsk_flush(xs); |
---|
| 290 | + __list_del_clearprev(&xs->flush_node); |
---|
| 291 | + } |
---|
| 292 | +} |
---|
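Reviewer note: __xsk_map_redirect() queues the socket on the per-CPU xskmap_flush_list instead of flushing immediately, and __xsk_map_flush() drains it once per NAPI cycle. Drivers do not call it directly; the usual shape of the call site, sketched with hypothetical `mydrv_*` names, is the generic XDP flush helper at the end of the poll loop:

```c
/* Sketch of the call site that eventually reaches __xsk_map_flush().
 * mydrv_* names are hypothetical; xdp_do_flush() is the generic helper
 * drivers call once per NAPI poll after possible XDP_REDIRECT actions.
 */
static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_ring *ring = container_of(napi, struct mydrv_ring, napi);
	int done = mydrv_process_rx(ring, budget);	/* may redirect to an XSKMAP */

	xdp_do_flush();		/* flushes this CPU's xskmap_flush_list */

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}
```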
| 293 | + |
---|
| 294 | +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) |
---|
| 295 | +{ |
---|
| 296 | + xskq_prod_submit_n(pool->cq, nb_entries); |
---|
| 297 | +} |
---|
| 298 | +EXPORT_SYMBOL(xsk_tx_completed); |
---|
| 299 | + |
---|
| 300 | +void xsk_tx_release(struct xsk_buff_pool *pool) |
---|
150 | 301 | { |
---|
151 | 302 | struct xdp_sock *xs; |
---|
152 | 303 | |
---|
153 | 304 | rcu_read_lock(); |
---|
154 | | - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { |
---|
155 | | - xs->sk.sk_write_space(&xs->sk); |
---|
| 305 | + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { |
---|
| 306 | + __xskq_cons_release(xs->tx); |
---|
| 307 | + if (xsk_tx_writeable(xs)) |
---|
| 308 | + xs->sk.sk_write_space(&xs->sk); |
---|
156 | 309 | } |
---|
157 | 310 | rcu_read_unlock(); |
---|
158 | 311 | } |
---|
159 | | -EXPORT_SYMBOL(xsk_umem_consume_tx_done); |
---|
| 312 | +EXPORT_SYMBOL(xsk_tx_release); |
---|
160 | 313 | |
---|
161 | | -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) |
---|
| 314 | +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) |
---|
162 | 315 | { |
---|
163 | | - struct xdp_desc desc; |
---|
164 | 316 | struct xdp_sock *xs; |
---|
165 | 317 | |
---|
166 | 318 | rcu_read_lock(); |
---|
167 | | - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { |
---|
168 | | - if (!xskq_peek_desc(xs->tx, &desc)) |
---|
| 319 | + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { |
---|
| 320 | + if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { |
---|
| 321 | + xs->tx->queue_empty_descs++; |
---|
169 | 322 | continue; |
---|
| 323 | + } |
---|
170 | 324 | |
---|
171 | | - if (xskq_produce_addr_lazy(umem->cq, desc.addr)) |
---|
| 325 | + /* This is the backpressure mechanism for the Tx path. |
---|
| 326 | + * Reserve space in the completion queue and only proceed |
---|
| 327 | + * if there is space in it. This avoids having to implement |
---|
| 328 | + * any buffering in the Tx path. |
---|
| 329 | + */ |
---|
| 330 | + if (xskq_prod_reserve_addr(pool->cq, desc->addr)) |
---|
172 | 331 | goto out; |
---|
173 | 332 | |
---|
174 | | - *dma = xdp_umem_get_dma(umem, desc.addr); |
---|
175 | | - *len = desc.len; |
---|
176 | | - |
---|
177 | | - xskq_discard_desc(xs->tx); |
---|
| 333 | + xskq_cons_release(xs->tx); |
---|
178 | 334 | rcu_read_unlock(); |
---|
179 | 335 | return true; |
---|
180 | 336 | } |
---|
.. | .. |
---|
183 | 339 | rcu_read_unlock(); |
---|
184 | 340 | return false; |
---|
185 | 341 | } |
---|
186 | | -EXPORT_SYMBOL(xsk_umem_consume_tx); |
---|
| 342 | +EXPORT_SYMBOL(xsk_tx_peek_desc); |
---|
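Reviewer note: xsk_tx_peek_desc(), xsk_tx_release() and xsk_tx_completed() form the driver-facing Tx API: peek reserves a completion-queue slot (the backpressure comment above), release advances the Tx ring for user space, and completed publishes finished frames. A hedged sketch of a zero-copy transmit loop; `mydrv_*` names are hypothetical, and xsk_buff_raw_get_dma() is assumed to be the driver helper from net/xdp_sock_drv.h on kernels carrying this code.

```c
/* Sketch of a zero-copy Tx path on top of the renamed helpers. */
static void mydrv_xmit_zc(struct mydrv_tx_ring *ring, int budget)
{
	struct xsk_buff_pool *pool = ring->xsk_pool;
	struct xdp_desc desc;
	int sent = 0;

	while (sent < budget && xsk_tx_peek_desc(pool, &desc)) {
		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);

		mydrv_post_tx_frame(ring, dma, desc.len);	/* hypothetical */
		sent++;
	}

	if (sent) {
		mydrv_ring_doorbell(ring);	/* hypothetical */
		xsk_tx_release(pool);		/* let user space reuse Tx entries */
	}
}

/* And in the Tx completion handler, once the HW is done with 'n' frames: */
static void mydrv_clean_tx_zc(struct mydrv_tx_ring *ring, u32 n)
{
	xsk_tx_completed(ring->xsk_pool, n);	/* publish to the completion ring */
}
```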
187 | 343 | |
---|
188 | | -static int xsk_zc_xmit(struct sock *sk) |
---|
| 344 | +static int xsk_wakeup(struct xdp_sock *xs, u8 flags) |
---|
189 | 345 | { |
---|
190 | | - struct xdp_sock *xs = xdp_sk(sk); |
---|
191 | 346 | struct net_device *dev = xs->dev; |
---|
| 347 | + int err; |
---|
192 | 348 | |
---|
193 | | - return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id); |
---|
| 349 | + rcu_read_lock(); |
---|
| 350 | + err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); |
---|
| 351 | + rcu_read_unlock(); |
---|
| 352 | + |
---|
| 353 | + return err; |
---|
| 354 | +} |
---|
| 355 | + |
---|
| 356 | +static int xsk_zc_xmit(struct xdp_sock *xs) |
---|
| 357 | +{ |
---|
| 358 | + return xsk_wakeup(xs, XDP_WAKEUP_TX); |
---|
194 | 359 | } |
---|
195 | 360 | |
---|
196 | 361 | static void xsk_destruct_skb(struct sk_buff *skb) |
---|
.. | .. |
---|
199 | 364 | struct xdp_sock *xs = xdp_sk(skb->sk); |
---|
200 | 365 | unsigned long flags; |
---|
201 | 366 | |
---|
202 | | - spin_lock_irqsave(&xs->tx_completion_lock, flags); |
---|
203 | | - WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); |
---|
204 | | - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); |
---|
| 367 | + spin_lock_irqsave(&xs->pool->cq_lock, flags); |
---|
| 368 | + xskq_prod_submit_addr(xs->pool->cq, addr); |
---|
| 369 | + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); |
---|
205 | 370 | |
---|
206 | 371 | sock_wfree(skb); |
---|
207 | 372 | } |
---|
208 | 373 | |
---|
209 | | -static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, |
---|
210 | | - size_t total_len) |
---|
| 374 | +static int xsk_generic_xmit(struct sock *sk) |
---|
211 | 375 | { |
---|
212 | | - u32 max_batch = TX_BATCH_SIZE; |
---|
213 | 376 | struct xdp_sock *xs = xdp_sk(sk); |
---|
| 377 | + u32 max_batch = TX_BATCH_SIZE; |
---|
214 | 378 | bool sent_frame = false; |
---|
215 | 379 | struct xdp_desc desc; |
---|
216 | 380 | struct sk_buff *skb; |
---|
| 381 | + unsigned long flags; |
---|
217 | 382 | int err = 0; |
---|
| 383 | + u32 hr, tr; |
---|
218 | 384 | |
---|
219 | 385 | mutex_lock(&xs->mutex); |
---|
220 | 386 | |
---|
221 | 387 | if (xs->queue_id >= xs->dev->real_num_tx_queues) |
---|
222 | 388 | goto out; |
---|
223 | 389 | |
---|
224 | | - while (xskq_peek_desc(xs->tx, &desc)) { |
---|
| 390 | + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); |
---|
| 391 | + tr = xs->dev->needed_tailroom; |
---|
| 392 | + |
---|
| 393 | + while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { |
---|
225 | 394 | char *buffer; |
---|
226 | 395 | u64 addr; |
---|
227 | 396 | u32 len; |
---|
.. | .. |
---|
232 | 401 | } |
---|
233 | 402 | |
---|
234 | 403 | len = desc.len; |
---|
235 | | - skb = sock_alloc_send_skb(sk, len, 1, &err); |
---|
| 404 | + skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err); |
---|
236 | 405 | if (unlikely(!skb)) |
---|
237 | 406 | goto out; |
---|
238 | 407 | |
---|
| 408 | + skb_reserve(skb, hr); |
---|
239 | 409 | skb_put(skb, len); |
---|
| 410 | + |
---|
240 | 411 | addr = desc.addr; |
---|
241 | | - buffer = xdp_umem_get_data(xs->umem, addr); |
---|
| 412 | + buffer = xsk_buff_raw_get_data(xs->pool, addr); |
---|
242 | 413 | err = skb_store_bits(skb, 0, buffer, len); |
---|
243 | | - if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { |
---|
| 414 | + /* This is the backpressure mechanism for the Tx path. |
---|
| 415 | + * Reserve space in the completion queue and only proceed |
---|
| 416 | + * if there is space in it. This avoids having to implement |
---|
| 417 | + * any buffering in the Tx path. |
---|
| 418 | + */ |
---|
| 419 | + spin_lock_irqsave(&xs->pool->cq_lock, flags); |
---|
| 420 | + if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { |
---|
| 421 | + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); |
---|
244 | 422 | kfree_skb(skb); |
---|
245 | 423 | goto out; |
---|
246 | 424 | } |
---|
| 425 | + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); |
---|
247 | 426 | |
---|
248 | 427 | skb->dev = xs->dev; |
---|
249 | 428 | skb->priority = sk->sk_priority; |
---|
250 | 429 | skb->mark = sk->sk_mark; |
---|
251 | | - skb_shinfo(skb)->destructor_arg = (void *)(long)addr; |
---|
| 430 | + skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; |
---|
252 | 431 | skb->destructor = xsk_destruct_skb; |
---|
253 | 432 | |
---|
254 | | - err = dev_direct_xmit(skb, xs->queue_id); |
---|
255 | | - xskq_discard_desc(xs->tx); |
---|
| 433 | + err = __dev_direct_xmit(skb, xs->queue_id); |
---|
| 434 | + if (err == NETDEV_TX_BUSY) { |
---|
| 435 | + /* Tell user-space to retry the send */ |
---|
| 436 | + skb->destructor = sock_wfree; |
---|
| 437 | + spin_lock_irqsave(&xs->pool->cq_lock, flags); |
---|
| 438 | + xskq_prod_cancel(xs->pool->cq); |
---|
| 439 | + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); |
---|
| 440 | + /* Free skb without triggering the perf drop trace */ |
---|
| 441 | + consume_skb(skb); |
---|
| 442 | + err = -EAGAIN; |
---|
| 443 | + goto out; |
---|
| 444 | + } |
---|
| 445 | + |
---|
| 446 | + xskq_cons_release(xs->tx); |
---|
256 | 447 | /* Ignore NET_XMIT_CN as packet might have been sent */ |
---|
257 | | - if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { |
---|
| 448 | + if (err == NET_XMIT_DROP) { |
---|
258 | 449 | /* SKB completed but not sent */ |
---|
259 | 450 | err = -EBUSY; |
---|
260 | 451 | goto out; |
---|
.. | .. |
---|
263 | 454 | sent_frame = true; |
---|
264 | 455 | } |
---|
265 | 456 | |
---|
| 457 | + xs->tx->queue_empty_descs++; |
---|
| 458 | + |
---|
266 | 459 | out: |
---|
267 | 460 | if (sent_frame) |
---|
268 | | - sk->sk_write_space(sk); |
---|
| 461 | + if (xsk_tx_writeable(xs)) |
---|
| 462 | + sk->sk_write_space(sk); |
---|
269 | 463 | |
---|
270 | 464 | mutex_unlock(&xs->mutex); |
---|
271 | 465 | return err; |
---|
| 466 | +} |
---|
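Reviewer note: the NETDEV_TX_BUSY branch above cancels the reserved completion entry and returns -EAGAIN so user space can simply retry the kick. In copy mode that kick is an ordinary sendto() on the socket; a minimal user-space sketch of the retry loop (descriptor filling is assumed to have happened already):

```c
#include <errno.h>
#include <sys/socket.h>

/* Sketch: kick the kernel Tx path of an AF_XDP socket and retry on the
 * transient errors xsk_generic_xmit() can produce (EAGAIN from the
 * NETDEV_TX_BUSY branch above, EBUSY from NET_XMIT_DROP).
 */
static void xsk_kick_tx(int xsk_fd)
{
	for (;;) {
		if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) >= 0)
			return;
		if (errno != EAGAIN && errno != EBUSY &&
		    errno != ENOBUFS && errno != EINTR)
			return;	/* real error: leave it to the caller */
	}
}
```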
| 467 | + |
---|
| 468 | +static int __xsk_sendmsg(struct sock *sk) |
---|
| 469 | +{ |
---|
| 470 | + struct xdp_sock *xs = xdp_sk(sk); |
---|
| 471 | + |
---|
| 472 | + if (unlikely(!(xs->dev->flags & IFF_UP))) |
---|
| 473 | + return -ENETDOWN; |
---|
| 474 | + if (unlikely(!xs->tx)) |
---|
| 475 | + return -ENOBUFS; |
---|
| 476 | + |
---|
| 477 | + return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); |
---|
272 | 478 | } |
---|
273 | 479 | |
---|
274 | 480 | static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) |
---|
.. | .. |
---|
277 | 483 | struct sock *sk = sock->sk; |
---|
278 | 484 | struct xdp_sock *xs = xdp_sk(sk); |
---|
279 | 485 | |
---|
280 | | - if (unlikely(!xs->dev)) |
---|
| 486 | + if (unlikely(!xsk_is_bound(xs))) |
---|
281 | 487 | return -ENXIO; |
---|
282 | | - if (unlikely(!(xs->dev->flags & IFF_UP))) |
---|
283 | | - return -ENETDOWN; |
---|
284 | | - if (unlikely(!xs->tx)) |
---|
285 | | - return -ENOBUFS; |
---|
286 | | - if (need_wait) |
---|
| 488 | + if (unlikely(need_wait)) |
---|
287 | 489 | return -EOPNOTSUPP; |
---|
288 | 490 | |
---|
289 | | - return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); |
---|
| 491 | + return __xsk_sendmsg(sk); |
---|
290 | 492 | } |
---|
291 | 493 | |
---|
292 | 494 | static __poll_t xsk_poll(struct file *file, struct socket *sock, |
---|
293 | 495 | struct poll_table_struct *wait) |
---|
294 | 496 | { |
---|
295 | | - __poll_t mask = datagram_poll(file, sock, wait); |
---|
| 497 | + __poll_t mask = 0; |
---|
296 | 498 | struct sock *sk = sock->sk; |
---|
297 | 499 | struct xdp_sock *xs = xdp_sk(sk); |
---|
| 500 | + struct xsk_buff_pool *pool; |
---|
298 | 501 | |
---|
299 | | - if (xs->rx && !xskq_empty_desc(xs->rx)) |
---|
| 502 | + sock_poll_wait(file, sock, wait); |
---|
| 503 | + |
---|
| 504 | + if (unlikely(!xsk_is_bound(xs))) |
---|
| 505 | + return mask; |
---|
| 506 | + |
---|
| 507 | + pool = xs->pool; |
---|
| 508 | + |
---|
| 509 | + if (pool->cached_need_wakeup) { |
---|
| 510 | + if (xs->zc) |
---|
| 511 | + xsk_wakeup(xs, pool->cached_need_wakeup); |
---|
| 512 | + else |
---|
| 513 | + /* Poll needs to drive Tx also in copy mode */ |
---|
| 514 | + __xsk_sendmsg(sk); |
---|
| 515 | + } |
---|
| 516 | + |
---|
| 517 | + if (xs->rx && !xskq_prod_is_empty(xs->rx)) |
---|
300 | 518 | mask |= EPOLLIN | EPOLLRDNORM; |
---|
301 | | - if (xs->tx && !xskq_full_desc(xs->tx)) |
---|
| 519 | + if (xs->tx && xsk_tx_writeable(xs)) |
---|
302 | 520 | mask |= EPOLLOUT | EPOLLWRNORM; |
---|
303 | 521 | |
---|
304 | 522 | return mask; |
---|
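Reviewer note: with XDP_USE_NEED_WAKEUP, xsk_poll() both reports readiness and performs the wakeup itself (driving Tx in copy mode), so the user-space fast path only needs a syscall when the ring's flag is set. A hedged sketch of the user-space side; `tx_ring_flags` is assumed to point at the flags word mapped via the XDP_MMAP_OFFSETS tx.flags offset, which libbpf's xsk_ring_prod__needs_wakeup() wraps.

```c
#include <poll.h>
#include <linux/if_xdp.h>

/* Sketch: only enter the kernel when the Tx ring requests a wakeup. */
static void maybe_kick_tx(int xsk_fd, __u32 *tx_ring_flags)
{
	if (*tx_ring_flags & XDP_RING_NEED_WAKEUP) {
		struct pollfd pfd = { .fd = xsk_fd, .events = POLLOUT };

		poll(&pfd, 1, 0);	/* xsk_poll() performs the wakeup */
	}
}
```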
.. | .. |
---|
322 | 540 | return 0; |
---|
323 | 541 | } |
---|
324 | 542 | |
---|
| 543 | +static void xsk_unbind_dev(struct xdp_sock *xs) |
---|
| 544 | +{ |
---|
| 545 | + struct net_device *dev = xs->dev; |
---|
| 546 | + |
---|
| 547 | + if (xs->state != XSK_BOUND) |
---|
| 548 | + return; |
---|
| 549 | + WRITE_ONCE(xs->state, XSK_UNBOUND); |
---|
| 550 | + |
---|
| 551 | + /* Wait for driver to stop using the xdp socket. */ |
---|
| 552 | + xp_del_xsk(xs->pool, xs); |
---|
| 553 | + xs->dev = NULL; |
---|
| 554 | + synchronize_net(); |
---|
| 555 | + dev_put(dev); |
---|
| 556 | +} |
---|
| 557 | + |
---|
| 558 | +static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, |
---|
| 559 | + struct xdp_sock ***map_entry) |
---|
| 560 | +{ |
---|
| 561 | + struct xsk_map *map = NULL; |
---|
| 562 | + struct xsk_map_node *node; |
---|
| 563 | + |
---|
| 564 | + *map_entry = NULL; |
---|
| 565 | + |
---|
| 566 | + spin_lock_bh(&xs->map_list_lock); |
---|
| 567 | + node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, |
---|
| 568 | + node); |
---|
| 569 | + if (node) { |
---|
| 570 | + WARN_ON(xsk_map_inc(node->map)); |
---|
| 571 | + map = node->map; |
---|
| 572 | + *map_entry = node->map_entry; |
---|
| 573 | + } |
---|
| 574 | + spin_unlock_bh(&xs->map_list_lock); |
---|
| 575 | + return map; |
---|
| 576 | +} |
---|
| 577 | + |
---|
| 578 | +static void xsk_delete_from_maps(struct xdp_sock *xs) |
---|
| 579 | +{ |
---|
| 580 | + /* This function removes the current XDP socket from all the |
---|
| 581 | + * maps it resides in. We need to take extra care here, due to |
---|
| 582 | + * the two locks involved. Each map has a lock synchronizing |
---|
| 583 | + * updates to the entries, and each socket has a lock that |
---|
| 584 | + * synchronizes access to the list of maps (map_list). For |
---|
| 585 | + * deadlock avoidance the locks need to be taken in the order |
---|
| 586 | + * "map lock"->"socket map list lock". We start off by |
---|
| 587 | + * accessing the socket map list, and take a reference to the |
---|
| 588 | + * map to guarantee existence between the |
---|
| 589 | + * xsk_get_map_list_entry() and xsk_map_try_sock_delete() |
---|
| 590 | + * calls. Then we ask the map to remove the socket, which |
---|
| 591 | + * tries to remove the socket from the map. Note that there |
---|
| 592 | + * might be updates to the map between |
---|
| 593 | + * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). |
---|
| 594 | + */ |
---|
| 595 | + struct xdp_sock **map_entry = NULL; |
---|
| 596 | + struct xsk_map *map; |
---|
| 597 | + |
---|
| 598 | + while ((map = xsk_get_map_list_entry(xs, &map_entry))) { |
---|
| 599 | + xsk_map_try_sock_delete(map, xs, map_entry); |
---|
| 600 | + xsk_map_put(map); |
---|
| 601 | + } |
---|
| 602 | +} |
---|
| 603 | + |
---|
325 | 604 | static int xsk_release(struct socket *sock) |
---|
326 | 605 | { |
---|
327 | 606 | struct sock *sk = sock->sk; |
---|
.. | .. |
---|
333 | 612 | |
---|
334 | 613 | net = sock_net(sk); |
---|
335 | 614 | |
---|
| 615 | + mutex_lock(&net->xdp.lock); |
---|
| 616 | + sk_del_node_init_rcu(sk); |
---|
| 617 | + mutex_unlock(&net->xdp.lock); |
---|
| 618 | + |
---|
336 | 619 | local_bh_disable(); |
---|
337 | 620 | sock_prot_inuse_add(net, sk->sk_prot, -1); |
---|
338 | 621 | local_bh_enable(); |
---|
339 | 622 | |
---|
340 | | - if (xs->dev) { |
---|
341 | | - struct net_device *dev = xs->dev; |
---|
342 | | - |
---|
343 | | - /* Wait for driver to stop using the xdp socket. */ |
---|
344 | | - xdp_del_sk_umem(xs->umem, xs); |
---|
345 | | - xs->dev = NULL; |
---|
346 | | - synchronize_net(); |
---|
347 | | - dev_put(dev); |
---|
348 | | - } |
---|
| 623 | + xsk_delete_from_maps(xs); |
---|
| 624 | + mutex_lock(&xs->mutex); |
---|
| 625 | + xsk_unbind_dev(xs); |
---|
| 626 | + mutex_unlock(&xs->mutex); |
---|
349 | 627 | |
---|
350 | 628 | xskq_destroy(xs->rx); |
---|
351 | 629 | xskq_destroy(xs->tx); |
---|
| 630 | + xskq_destroy(xs->fq_tmp); |
---|
| 631 | + xskq_destroy(xs->cq_tmp); |
---|
352 | 632 | |
---|
353 | 633 | sock_orphan(sk); |
---|
354 | 634 | sock->sk = NULL; |
---|
.. | .. |
---|
376 | 656 | return sock; |
---|
377 | 657 | } |
---|
378 | 658 | |
---|
| 659 | +static bool xsk_validate_queues(struct xdp_sock *xs) |
---|
| 660 | +{ |
---|
| 661 | + return xs->fq_tmp && xs->cq_tmp; |
---|
| 662 | +} |
---|
| 663 | + |
---|
379 | 664 | static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) |
---|
380 | 665 | { |
---|
381 | 666 | struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; |
---|
382 | 667 | struct sock *sk = sock->sk; |
---|
383 | 668 | struct xdp_sock *xs = xdp_sk(sk); |
---|
384 | 669 | struct net_device *dev; |
---|
| 670 | + int bound_dev_if; |
---|
385 | 671 | u32 flags, qid; |
---|
386 | 672 | int err = 0; |
---|
387 | 673 | |
---|
.. | .. |
---|
390 | 676 | if (sxdp->sxdp_family != AF_XDP) |
---|
391 | 677 | return -EINVAL; |
---|
392 | 678 | |
---|
| 679 | + flags = sxdp->sxdp_flags; |
---|
| 680 | + if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | |
---|
| 681 | + XDP_USE_NEED_WAKEUP)) |
---|
| 682 | + return -EINVAL; |
---|
| 683 | + |
---|
| 684 | + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); |
---|
| 685 | + if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) |
---|
| 686 | + return -EINVAL; |
---|
| 687 | + |
---|
| 688 | + rtnl_lock(); |
---|
393 | 689 | mutex_lock(&xs->mutex); |
---|
394 | | - if (xs->dev) { |
---|
| 690 | + if (xs->state != XSK_READY) { |
---|
395 | 691 | err = -EBUSY; |
---|
396 | 692 | goto out_release; |
---|
397 | 693 | } |
---|
.. | .. |
---|
409 | 705 | |
---|
410 | 706 | qid = sxdp->sxdp_queue_id; |
---|
411 | 707 | |
---|
412 | | - if ((xs->rx && qid >= dev->real_num_rx_queues) || |
---|
413 | | - (xs->tx && qid >= dev->real_num_tx_queues)) { |
---|
414 | | - err = -EINVAL; |
---|
415 | | - goto out_unlock; |
---|
416 | | - } |
---|
417 | | - |
---|
418 | | - flags = sxdp->sxdp_flags; |
---|
419 | | - |
---|
420 | 708 | if (flags & XDP_SHARED_UMEM) { |
---|
421 | 709 | struct xdp_sock *umem_xs; |
---|
422 | 710 | struct socket *sock; |
---|
423 | 711 | |
---|
424 | | - if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { |
---|
| 712 | + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || |
---|
| 713 | + (flags & XDP_USE_NEED_WAKEUP)) { |
---|
425 | 714 | /* Cannot specify flags for shared sockets. */ |
---|
426 | 715 | err = -EINVAL; |
---|
427 | 716 | goto out_unlock; |
---|
.. | .. |
---|
440 | 729 | } |
---|
441 | 730 | |
---|
442 | 731 | umem_xs = xdp_sk(sock->sk); |
---|
443 | | - if (!umem_xs->umem) { |
---|
444 | | - /* No umem to inherit. */ |
---|
| 732 | + if (!xsk_is_bound(umem_xs)) { |
---|
445 | 733 | err = -EBADF; |
---|
446 | 734 | sockfd_put(sock); |
---|
447 | 735 | goto out_unlock; |
---|
448 | | - } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { |
---|
449 | | - err = -EINVAL; |
---|
450 | | - sockfd_put(sock); |
---|
451 | | - goto out_unlock; |
---|
| 736 | + } |
---|
| 737 | + |
---|
| 738 | + if (umem_xs->queue_id != qid || umem_xs->dev != dev) { |
---|
| 739 | + /* Share the umem with another socket on another qid |
---|
| 740 | + * and/or device. |
---|
| 741 | + */ |
---|
| 742 | + xs->pool = xp_create_and_assign_umem(xs, |
---|
| 743 | + umem_xs->umem); |
---|
| 744 | + if (!xs->pool) { |
---|
| 745 | + err = -ENOMEM; |
---|
| 746 | + sockfd_put(sock); |
---|
| 747 | + goto out_unlock; |
---|
| 748 | + } |
---|
| 749 | + |
---|
| 750 | + err = xp_assign_dev_shared(xs->pool, umem_xs, dev, |
---|
| 751 | + qid); |
---|
| 752 | + if (err) { |
---|
| 753 | + xp_destroy(xs->pool); |
---|
| 754 | + xs->pool = NULL; |
---|
| 755 | + sockfd_put(sock); |
---|
| 756 | + goto out_unlock; |
---|
| 757 | + } |
---|
| 758 | + } else { |
---|
| 759 | + /* Share the buffer pool with the other socket. */ |
---|
| 760 | + if (xs->fq_tmp || xs->cq_tmp) { |
---|
| 761 | + /* Do not allow setting your own fq or cq. */ |
---|
| 762 | + err = -EINVAL; |
---|
| 763 | + sockfd_put(sock); |
---|
| 764 | + goto out_unlock; |
---|
| 765 | + } |
---|
| 766 | + |
---|
| 767 | + xp_get_pool(umem_xs->pool); |
---|
| 768 | + xs->pool = umem_xs->pool; |
---|
452 | 769 | } |
---|
453 | 770 | |
---|
454 | 771 | xdp_get_umem(umem_xs->umem); |
---|
455 | 772 | WRITE_ONCE(xs->umem, umem_xs->umem); |
---|
456 | 773 | sockfd_put(sock); |
---|
457 | | - } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { |
---|
| 774 | + } else if (!xs->umem || !xsk_validate_queues(xs)) { |
---|
458 | 775 | err = -EINVAL; |
---|
459 | 776 | goto out_unlock; |
---|
460 | 777 | } else { |
---|
461 | 778 | /* This xsk has its own umem. */ |
---|
462 | | - xskq_set_umem(xs->umem->fq, &xs->umem->props); |
---|
463 | | - xskq_set_umem(xs->umem->cq, &xs->umem->props); |
---|
464 | | - |
---|
465 | | - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); |
---|
466 | | - if (err) |
---|
| 779 | + xs->pool = xp_create_and_assign_umem(xs, xs->umem); |
---|
| 780 | + if (!xs->pool) { |
---|
| 781 | + err = -ENOMEM; |
---|
467 | 782 | goto out_unlock; |
---|
| 783 | + } |
---|
| 784 | + |
---|
| 785 | + err = xp_assign_dev(xs->pool, dev, qid, flags); |
---|
| 786 | + if (err) { |
---|
| 787 | + xp_destroy(xs->pool); |
---|
| 788 | + xs->pool = NULL; |
---|
| 789 | + goto out_unlock; |
---|
| 790 | + } |
---|
468 | 791 | } |
---|
| 792 | + |
---|
| 793 | + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ |
---|
| 794 | + xs->fq_tmp = NULL; |
---|
| 795 | + xs->cq_tmp = NULL; |
---|
469 | 796 | |
---|
470 | 797 | xs->dev = dev; |
---|
471 | 798 | xs->zc = xs->umem->zc; |
---|
472 | 799 | xs->queue_id = qid; |
---|
473 | | - xskq_set_umem(xs->rx, &xs->umem->props); |
---|
474 | | - xskq_set_umem(xs->tx, &xs->umem->props); |
---|
475 | | - xdp_add_sk_umem(xs->umem, xs); |
---|
| 800 | + xp_add_xsk(xs->pool, xs); |
---|
476 | 801 | |
---|
477 | 802 | out_unlock: |
---|
478 | | - if (err) |
---|
| 803 | + if (err) { |
---|
479 | 804 | dev_put(dev); |
---|
| 805 | + } else { |
---|
| 806 | + /* Matches smp_rmb() in bind() for shared umem |
---|
| 807 | + * sockets, and xsk_is_bound(). |
---|
| 808 | + */ |
---|
| 809 | + smp_wmb(); |
---|
| 810 | + WRITE_ONCE(xs->state, XSK_BOUND); |
---|
| 811 | + } |
---|
480 | 812 | out_release: |
---|
481 | 813 | mutex_unlock(&xs->mutex); |
---|
| 814 | + rtnl_unlock(); |
---|
482 | 815 | return err; |
---|
483 | 816 | } |
---|
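Reviewer note: the reworked XDP_SHARED_UMEM branch now distinguishes sharing on the same dev/queue (reuse the buffer pool, no own fill/completion rings allowed) from sharing on a different dev/queue (a new pool on the same umem, own fill/completion rings required). From user space the bind call is the same in both cases; a minimal sketch with error handling trimmed:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44
#endif

/* Sketch: bind a second AF_XDP socket sharing the umem of 'primary_fd'.
 * With this patch ifindex/queue_id may differ from the primary socket;
 * in that case the new socket needs its own fill and completion rings,
 * while sharing the exact same dev/queue forbids them (the -EINVAL
 * check above).
 */
static int xsk_bind_shared(int fd, int primary_fd,
			   unsigned int ifindex, unsigned int queue_id)
{
	struct sockaddr_xdp sxdp;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_flags = XDP_SHARED_UMEM;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue_id;
	sxdp.sxdp_shared_umem_fd = primary_fd;

	return bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}
```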
484 | 817 | |
---|
| 818 | +struct xdp_umem_reg_v1 { |
---|
| 819 | + __u64 addr; /* Start of packet data area */ |
---|
| 820 | + __u64 len; /* Length of packet data area */ |
---|
| 821 | + __u32 chunk_size; |
---|
| 822 | + __u32 headroom; |
---|
| 823 | +}; |
---|
| 824 | + |
---|
485 | 825 | static int xsk_setsockopt(struct socket *sock, int level, int optname, |
---|
486 | | - char __user *optval, unsigned int optlen) |
---|
| 826 | + sockptr_t optval, unsigned int optlen) |
---|
487 | 827 | { |
---|
488 | 828 | struct sock *sk = sock->sk; |
---|
489 | 829 | struct xdp_sock *xs = xdp_sk(sk); |
---|
.. | .. |
---|
501 | 841 | |
---|
502 | 842 | if (optlen < sizeof(entries)) |
---|
503 | 843 | return -EINVAL; |
---|
504 | | - if (copy_from_user(&entries, optval, sizeof(entries))) |
---|
| 844 | + if (copy_from_sockptr(&entries, optval, sizeof(entries))) |
---|
505 | 845 | return -EFAULT; |
---|
506 | 846 | |
---|
507 | 847 | mutex_lock(&xs->mutex); |
---|
| 848 | + if (xs->state != XSK_READY) { |
---|
| 849 | + mutex_unlock(&xs->mutex); |
---|
| 850 | + return -EBUSY; |
---|
| 851 | + } |
---|
508 | 852 | q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; |
---|
509 | 853 | err = xsk_init_queue(entries, q, false); |
---|
| 854 | + if (!err && optname == XDP_TX_RING) |
---|
| 855 | + /* Tx needs to be explicitly woken up the first time */ |
---|
| 856 | + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; |
---|
510 | 857 | mutex_unlock(&xs->mutex); |
---|
511 | 858 | return err; |
---|
512 | 859 | } |
---|
513 | 860 | case XDP_UMEM_REG: |
---|
514 | 861 | { |
---|
515 | | - struct xdp_umem_reg mr; |
---|
| 862 | + size_t mr_size = sizeof(struct xdp_umem_reg); |
---|
| 863 | + struct xdp_umem_reg mr = {}; |
---|
516 | 864 | struct xdp_umem *umem; |
---|
517 | 865 | |
---|
518 | | - if (copy_from_user(&mr, optval, sizeof(mr))) |
---|
| 866 | + if (optlen < sizeof(struct xdp_umem_reg_v1)) |
---|
| 867 | + return -EINVAL; |
---|
| 868 | + else if (optlen < sizeof(mr)) |
---|
| 869 | + mr_size = sizeof(struct xdp_umem_reg_v1); |
---|
| 870 | + |
---|
| 871 | + if (copy_from_sockptr(&mr, optval, mr_size)) |
---|
519 | 872 | return -EFAULT; |
---|
520 | 873 | |
---|
521 | 874 | mutex_lock(&xs->mutex); |
---|
522 | | - if (xs->umem) { |
---|
| 875 | + if (xs->state != XSK_READY || xs->umem) { |
---|
523 | 876 | mutex_unlock(&xs->mutex); |
---|
524 | 877 | return -EBUSY; |
---|
525 | 878 | } |
---|
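Reviewer note: xdp_umem_reg_v1 and the mr_size handling exist because the UAPI struct later grew a flags field; old binaries pass the shorter struct and the kernel copies only what they provide. The user-space side is unchanged; a minimal sketch, assuming `buffer` is a page-aligned area sized as a multiple of chunk_size:

```c
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

/* Sketch: register a umem with an AF_XDP socket. The handler above also
 * accepts the older, shorter struct without the flags field.
 */
static int xsk_register_umem(int fd, void *buffer, __u64 size,
			     __u32 chunk_size, __u32 headroom)
{
	struct xdp_umem_reg mr;

	memset(&mr, 0, sizeof(mr));
	mr.addr = (__u64)(uintptr_t)buffer;
	mr.len = size;
	mr.chunk_size = chunk_size;	/* e.g. 2048 or 4096 */
	mr.headroom = headroom;

	return setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
}
```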
.. | .. |
---|
542 | 895 | struct xsk_queue **q; |
---|
543 | 896 | int entries; |
---|
544 | 897 | |
---|
545 | | - if (copy_from_user(&entries, optval, sizeof(entries))) |
---|
| 898 | + if (copy_from_sockptr(&entries, optval, sizeof(entries))) |
---|
546 | 899 | return -EFAULT; |
---|
547 | 900 | |
---|
548 | 901 | mutex_lock(&xs->mutex); |
---|
549 | | - if (!xs->umem) { |
---|
| 902 | + if (xs->state != XSK_READY) { |
---|
550 | 903 | mutex_unlock(&xs->mutex); |
---|
551 | | - return -EINVAL; |
---|
| 904 | + return -EBUSY; |
---|
552 | 905 | } |
---|
553 | 906 | |
---|
554 | | - q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : |
---|
555 | | - &xs->umem->cq; |
---|
| 907 | + q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : |
---|
| 908 | + &xs->cq_tmp; |
---|
556 | 909 | err = xsk_init_queue(entries, q, true); |
---|
557 | 910 | mutex_unlock(&xs->mutex); |
---|
558 | 911 | return err; |
---|
.. | .. |
---|
563 | 916 | |
---|
564 | 917 | return -ENOPROTOOPT; |
---|
565 | 918 | } |
---|
| 919 | + |
---|
| 920 | +static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) |
---|
| 921 | +{ |
---|
| 922 | + ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); |
---|
| 923 | + ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); |
---|
| 924 | + ring->desc = offsetof(struct xdp_rxtx_ring, desc); |
---|
| 925 | +} |
---|
| 926 | + |
---|
| 927 | +static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) |
---|
| 928 | +{ |
---|
| 929 | + ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); |
---|
| 930 | + ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); |
---|
| 931 | + ring->desc = offsetof(struct xdp_umem_ring, desc); |
---|
| 932 | +} |
---|
| 933 | + |
---|
| 934 | +struct xdp_statistics_v1 { |
---|
| 935 | + __u64 rx_dropped; |
---|
| 936 | + __u64 rx_invalid_descs; |
---|
| 937 | + __u64 tx_invalid_descs; |
---|
| 938 | +}; |
---|
566 | 939 | |
---|
567 | 940 | static int xsk_getsockopt(struct socket *sock, int level, int optname, |
---|
568 | 941 | char __user *optval, int __user *optlen) |
---|
.. | .. |
---|
582 | 955 | switch (optname) { |
---|
583 | 956 | case XDP_STATISTICS: |
---|
584 | 957 | { |
---|
585 | | - struct xdp_statistics stats; |
---|
| 958 | + struct xdp_statistics stats = {}; |
---|
| 959 | + bool extra_stats = true; |
---|
| 960 | + size_t stats_size; |
---|
586 | 961 | |
---|
587 | | - if (len < sizeof(stats)) |
---|
| 962 | + if (len < sizeof(struct xdp_statistics_v1)) { |
---|
588 | 963 | return -EINVAL; |
---|
| 964 | + } else if (len < sizeof(stats)) { |
---|
| 965 | + extra_stats = false; |
---|
| 966 | + stats_size = sizeof(struct xdp_statistics_v1); |
---|
| 967 | + } else { |
---|
| 968 | + stats_size = sizeof(stats); |
---|
| 969 | + } |
---|
589 | 970 | |
---|
590 | 971 | mutex_lock(&xs->mutex); |
---|
591 | 972 | stats.rx_dropped = xs->rx_dropped; |
---|
| 973 | + if (extra_stats) { |
---|
| 974 | + stats.rx_ring_full = xs->rx_queue_full; |
---|
| 975 | + stats.rx_fill_ring_empty_descs = |
---|
| 976 | + xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; |
---|
| 977 | + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); |
---|
| 978 | + } else { |
---|
| 979 | + stats.rx_dropped += xs->rx_queue_full; |
---|
| 980 | + } |
---|
592 | 981 | stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); |
---|
593 | 982 | stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); |
---|
594 | 983 | mutex_unlock(&xs->mutex); |
---|
595 | 984 | |
---|
596 | | - if (copy_to_user(optval, &stats, sizeof(stats))) |
---|
| 985 | + if (copy_to_user(optval, &stats, stats_size)) |
---|
597 | 986 | return -EFAULT; |
---|
598 | | - if (put_user(sizeof(stats), optlen)) |
---|
| 987 | + if (put_user(stats_size, optlen)) |
---|
599 | 988 | return -EFAULT; |
---|
600 | 989 | |
---|
601 | 990 | return 0; |
---|
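Reviewer note: the handler now returns either the original three counters or the extended set depending on the length the caller passes, so new binaries on old kernels and vice versa keep working. A minimal user-space sketch, assuming a uapi if_xdp.h new enough to carry the extended fields:

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

/* Sketch: read AF_XDP socket statistics. Passing the full struct size
 * requests the extended counters; the kernel reports through optlen
 * how much it actually filled in.
 */
static int xsk_dump_stats(int fd)
{
	struct xdp_statistics stats;
	socklen_t optlen = sizeof(stats);

	memset(&stats, 0, sizeof(stats));
	if (getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen))
		return -1;

	printf("rx_dropped:       %llu\n", (unsigned long long)stats.rx_dropped);
	printf("rx_invalid_descs: %llu\n", (unsigned long long)stats.rx_invalid_descs);
	printf("tx_invalid_descs: %llu\n", (unsigned long long)stats.tx_invalid_descs);
	if (optlen >= sizeof(stats))	/* extended counters present */
		printf("rx_ring_full:     %llu\n",
		       (unsigned long long)stats.rx_ring_full);
	return 0;
}
```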
.. | .. |
---|
603 | 992 | case XDP_MMAP_OFFSETS: |
---|
604 | 993 | { |
---|
605 | 994 | struct xdp_mmap_offsets off; |
---|
| 995 | + struct xdp_mmap_offsets_v1 off_v1; |
---|
| 996 | + bool flags_supported = true; |
---|
| 997 | + void *to_copy; |
---|
606 | 998 | |
---|
607 | | - if (len < sizeof(off)) |
---|
| 999 | + if (len < sizeof(off_v1)) |
---|
| 1000 | + return -EINVAL; |
---|
| 1001 | + else if (len < sizeof(off)) |
---|
| 1002 | + flags_supported = false; |
---|
| 1003 | + |
---|
| 1004 | + if (flags_supported) { |
---|
| 1005 | + /* xdp_ring_offset is identical to xdp_ring_offset_v1 |
---|
| 1006 | + * except for the flags field added to the end. |
---|
| 1007 | + */ |
---|
| 1008 | + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) |
---|
| 1009 | + &off.rx); |
---|
| 1010 | + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) |
---|
| 1011 | + &off.tx); |
---|
| 1012 | + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) |
---|
| 1013 | + &off.fr); |
---|
| 1014 | + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) |
---|
| 1015 | + &off.cr); |
---|
| 1016 | + off.rx.flags = offsetof(struct xdp_rxtx_ring, |
---|
| 1017 | + ptrs.flags); |
---|
| 1018 | + off.tx.flags = offsetof(struct xdp_rxtx_ring, |
---|
| 1019 | + ptrs.flags); |
---|
| 1020 | + off.fr.flags = offsetof(struct xdp_umem_ring, |
---|
| 1021 | + ptrs.flags); |
---|
| 1022 | + off.cr.flags = offsetof(struct xdp_umem_ring, |
---|
| 1023 | + ptrs.flags); |
---|
| 1024 | + |
---|
| 1025 | + len = sizeof(off); |
---|
| 1026 | + to_copy = &off; |
---|
| 1027 | + } else { |
---|
| 1028 | + xsk_enter_rxtx_offsets(&off_v1.rx); |
---|
| 1029 | + xsk_enter_rxtx_offsets(&off_v1.tx); |
---|
| 1030 | + xsk_enter_umem_offsets(&off_v1.fr); |
---|
| 1031 | + xsk_enter_umem_offsets(&off_v1.cr); |
---|
| 1032 | + |
---|
| 1033 | + len = sizeof(off_v1); |
---|
| 1034 | + to_copy = &off_v1; |
---|
| 1035 | + } |
---|
| 1036 | + |
---|
| 1037 | + if (copy_to_user(optval, to_copy, len)) |
---|
| 1038 | + return -EFAULT; |
---|
| 1039 | + if (put_user(len, optlen)) |
---|
| 1040 | + return -EFAULT; |
---|
| 1041 | + |
---|
| 1042 | + return 0; |
---|
| 1043 | + } |
---|
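Reviewer note: XDP_MMAP_OFFSETS is versioned the same way; the flags offsets are only returned when the caller provides the larger struct. For reference, the standard user-space sequence is to query the offsets and then mmap() each ring with them; a minimal sketch for the Rx ring, assuming the ring size was already set with XDP_RX_RING:

```c
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

/* Sketch: map the Rx descriptor ring of an AF_XDP socket. 'ndescs' must
 * match the value previously configured with the XDP_RX_RING sockopt.
 */
static void *xsk_map_rx_ring(int fd, __u32 ndescs)
{
	struct xdp_mmap_offsets off;
	socklen_t optlen = sizeof(off);
	size_t len;

	if (getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen))
		return MAP_FAILED;

	/* Ring header up to the descriptor array, plus the array itself. */
	len = off.rx.desc + (size_t)ndescs * sizeof(struct xdp_desc);
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, XDP_PGOFF_RX_RING);
}
```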
| 1044 | + case XDP_OPTIONS: |
---|
| 1045 | + { |
---|
| 1046 | + struct xdp_options opts = {}; |
---|
| 1047 | + |
---|
| 1048 | + if (len < sizeof(opts)) |
---|
608 | 1049 | return -EINVAL; |
---|
609 | 1050 | |
---|
610 | | - off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); |
---|
611 | | - off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); |
---|
612 | | - off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); |
---|
613 | | - off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); |
---|
614 | | - off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); |
---|
615 | | - off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); |
---|
| 1051 | + mutex_lock(&xs->mutex); |
---|
| 1052 | + if (xs->zc) |
---|
| 1053 | + opts.flags |= XDP_OPTIONS_ZEROCOPY; |
---|
| 1054 | + mutex_unlock(&xs->mutex); |
---|
616 | 1055 | |
---|
617 | | - off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); |
---|
618 | | - off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); |
---|
619 | | - off.fr.desc = offsetof(struct xdp_umem_ring, desc); |
---|
620 | | - off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); |
---|
621 | | - off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); |
---|
622 | | - off.cr.desc = offsetof(struct xdp_umem_ring, desc); |
---|
623 | | - |
---|
624 | | - len = sizeof(off); |
---|
625 | | - if (copy_to_user(optval, &off, len)) |
---|
| 1056 | + len = sizeof(opts); |
---|
| 1057 | + if (copy_to_user(optval, &opts, len)) |
---|
626 | 1058 | return -EFAULT; |
---|
627 | 1059 | if (put_user(len, optlen)) |
---|
628 | 1060 | return -EFAULT; |
---|
.. | .. |
---|
643 | 1075 | unsigned long size = vma->vm_end - vma->vm_start; |
---|
644 | 1076 | struct xdp_sock *xs = xdp_sk(sock->sk); |
---|
645 | 1077 | struct xsk_queue *q = NULL; |
---|
646 | | - struct xdp_umem *umem; |
---|
647 | 1078 | unsigned long pfn; |
---|
648 | 1079 | struct page *qpg; |
---|
| 1080 | + |
---|
| 1081 | + if (READ_ONCE(xs->state) != XSK_READY) |
---|
| 1082 | + return -EBUSY; |
---|
649 | 1083 | |
---|
650 | 1084 | if (offset == XDP_PGOFF_RX_RING) { |
---|
651 | 1085 | q = READ_ONCE(xs->rx); |
---|
652 | 1086 | } else if (offset == XDP_PGOFF_TX_RING) { |
---|
653 | 1087 | q = READ_ONCE(xs->tx); |
---|
654 | 1088 | } else { |
---|
655 | | - umem = READ_ONCE(xs->umem); |
---|
656 | | - if (!umem) |
---|
657 | | - return -EINVAL; |
---|
658 | | - |
---|
659 | 1089 | /* Matches the smp_wmb() in XDP_UMEM_REG */ |
---|
660 | 1090 | smp_rmb(); |
---|
661 | 1091 | if (offset == XDP_UMEM_PGOFF_FILL_RING) |
---|
662 | | - q = READ_ONCE(umem->fq); |
---|
| 1092 | + q = READ_ONCE(xs->fq_tmp); |
---|
663 | 1093 | else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) |
---|
664 | | - q = READ_ONCE(umem->cq); |
---|
| 1094 | + q = READ_ONCE(xs->cq_tmp); |
---|
665 | 1095 | } |
---|
666 | 1096 | |
---|
667 | 1097 | if (!q) |
---|
.. | .. |
---|
670 | 1100 | /* Matches the smp_wmb() in xsk_init_queue */ |
---|
671 | 1101 | smp_rmb(); |
---|
672 | 1102 | qpg = virt_to_head_page(q->ring); |
---|
673 | | - if (size > (PAGE_SIZE << compound_order(qpg))) |
---|
| 1103 | + if (size > page_size(qpg)) |
---|
674 | 1104 | return -EINVAL; |
---|
675 | 1105 | |
---|
676 | 1106 | pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; |
---|
677 | 1107 | return remap_pfn_range(vma, vma->vm_start, pfn, |
---|
678 | 1108 | size, vma->vm_page_prot); |
---|
| 1109 | +} |
---|
| 1110 | + |
---|
| 1111 | +static int xsk_notifier(struct notifier_block *this, |
---|
| 1112 | + unsigned long msg, void *ptr) |
---|
| 1113 | +{ |
---|
| 1114 | + struct net_device *dev = netdev_notifier_info_to_dev(ptr); |
---|
| 1115 | + struct net *net = dev_net(dev); |
---|
| 1116 | + struct sock *sk; |
---|
| 1117 | + |
---|
| 1118 | + switch (msg) { |
---|
| 1119 | + case NETDEV_UNREGISTER: |
---|
| 1120 | + mutex_lock(&net->xdp.lock); |
---|
| 1121 | + sk_for_each(sk, &net->xdp.list) { |
---|
| 1122 | + struct xdp_sock *xs = xdp_sk(sk); |
---|
| 1123 | + |
---|
| 1124 | + mutex_lock(&xs->mutex); |
---|
| 1125 | + if (xs->dev == dev) { |
---|
| 1126 | + sk->sk_err = ENETDOWN; |
---|
| 1127 | + if (!sock_flag(sk, SOCK_DEAD)) |
---|
| 1128 | + sk->sk_error_report(sk); |
---|
| 1129 | + |
---|
| 1130 | + xsk_unbind_dev(xs); |
---|
| 1131 | + |
---|
| 1132 | + /* Clear device references. */ |
---|
| 1133 | + xp_clear_dev(xs->pool); |
---|
| 1134 | + } |
---|
| 1135 | + mutex_unlock(&xs->mutex); |
---|
| 1136 | + } |
---|
| 1137 | + mutex_unlock(&net->xdp.lock); |
---|
| 1138 | + break; |
---|
| 1139 | + } |
---|
| 1140 | + return NOTIFY_DONE; |
---|
679 | 1141 | } |
---|
680 | 1142 | |
---|
681 | 1143 | static struct proto xsk_proto = { |
---|
.. | .. |
---|
712 | 1174 | if (!sock_flag(sk, SOCK_DEAD)) |
---|
713 | 1175 | return; |
---|
714 | 1176 | |
---|
715 | | - xdp_put_umem(xs->umem); |
---|
| 1177 | + if (!xp_put_pool(xs->pool)) |
---|
| 1178 | + xdp_put_umem(xs->umem, !xs->pool); |
---|
716 | 1179 | |
---|
717 | 1180 | sk_refcnt_debug_dec(sk); |
---|
718 | 1181 | } |
---|
.. | .. |
---|
720 | 1183 | static int xsk_create(struct net *net, struct socket *sock, int protocol, |
---|
721 | 1184 | int kern) |
---|
722 | 1185 | { |
---|
723 | | - struct sock *sk; |
---|
724 | 1186 | struct xdp_sock *xs; |
---|
| 1187 | + struct sock *sk; |
---|
725 | 1188 | |
---|
726 | 1189 | if (!ns_capable(net->user_ns, CAP_NET_RAW)) |
---|
727 | 1190 | return -EPERM; |
---|
.. | .. |
---|
749 | 1212 | sock_set_flag(sk, SOCK_RCU_FREE); |
---|
750 | 1213 | |
---|
751 | 1214 | xs = xdp_sk(sk); |
---|
| 1215 | + xs->state = XSK_READY; |
---|
752 | 1216 | mutex_init(&xs->mutex); |
---|
753 | | - spin_lock_init(&xs->tx_completion_lock); |
---|
| 1217 | + spin_lock_init(&xs->rx_lock); |
---|
| 1218 | + |
---|
| 1219 | + INIT_LIST_HEAD(&xs->map_list); |
---|
| 1220 | + spin_lock_init(&xs->map_list_lock); |
---|
| 1221 | + |
---|
| 1222 | + mutex_lock(&net->xdp.lock); |
---|
| 1223 | + sk_add_node_rcu(sk, &net->xdp.list); |
---|
| 1224 | + mutex_unlock(&net->xdp.lock); |
---|
754 | 1225 | |
---|
755 | 1226 | local_bh_disable(); |
---|
756 | 1227 | sock_prot_inuse_add(net, &xsk_proto, 1); |
---|
.. | .. |
---|
765 | 1236 | .owner = THIS_MODULE, |
---|
766 | 1237 | }; |
---|
767 | 1238 | |
---|
| 1239 | +static struct notifier_block xsk_netdev_notifier = { |
---|
| 1240 | + .notifier_call = xsk_notifier, |
---|
| 1241 | +}; |
---|
| 1242 | + |
---|
| 1243 | +static int __net_init xsk_net_init(struct net *net) |
---|
| 1244 | +{ |
---|
| 1245 | + mutex_init(&net->xdp.lock); |
---|
| 1246 | + INIT_HLIST_HEAD(&net->xdp.list); |
---|
| 1247 | + return 0; |
---|
| 1248 | +} |
---|
| 1249 | + |
---|
| 1250 | +static void __net_exit xsk_net_exit(struct net *net) |
---|
| 1251 | +{ |
---|
| 1252 | + WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); |
---|
| 1253 | +} |
---|
| 1254 | + |
---|
| 1255 | +static struct pernet_operations xsk_net_ops = { |
---|
| 1256 | + .init = xsk_net_init, |
---|
| 1257 | + .exit = xsk_net_exit, |
---|
| 1258 | +}; |
---|
| 1259 | + |
---|
768 | 1260 | static int __init xsk_init(void) |
---|
769 | 1261 | { |
---|
770 | | - int err; |
---|
| 1262 | + int err, cpu; |
---|
771 | 1263 | |
---|
772 | 1264 | err = proto_register(&xsk_proto, 0 /* no slab */); |
---|
773 | 1265 | if (err) |
---|
.. | .. |
---|
777 | 1269 | if (err) |
---|
778 | 1270 | goto out_proto; |
---|
779 | 1271 | |
---|
| 1272 | + err = register_pernet_subsys(&xsk_net_ops); |
---|
| 1273 | + if (err) |
---|
| 1274 | + goto out_sk; |
---|
| 1275 | + |
---|
| 1276 | + err = register_netdevice_notifier(&xsk_netdev_notifier); |
---|
| 1277 | + if (err) |
---|
| 1278 | + goto out_pernet; |
---|
| 1279 | + |
---|
| 1280 | + for_each_possible_cpu(cpu) |
---|
| 1281 | + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); |
---|
780 | 1282 | return 0; |
---|
781 | 1283 | |
---|
| 1284 | +out_pernet: |
---|
| 1285 | + unregister_pernet_subsys(&xsk_net_ops); |
---|
| 1286 | +out_sk: |
---|
| 1287 | + sock_unregister(PF_XDP); |
---|
782 | 1288 | out_proto: |
---|
783 | 1289 | proto_unregister(&xsk_proto); |
---|
784 | 1290 | out: |
---|