```diff
@@ -17,12 +17,13 @@
 struct wg_peer;
 struct multicore_worker;
 struct crypt_queue;
+struct prev_queue;
 struct sk_buff;
 
 /* queueing.c APIs: */
 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-                         bool multicore, unsigned int len);
-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
+                         unsigned int len);
+void wg_packet_queue_free(struct crypt_queue *queue, bool purge);
 struct multicore_worker __percpu *
 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
 
```
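The prototypes above drop the multicore flag from wg_packet_queue_init() and repurpose wg_packet_queue_free()'s flag as a purge request. A minimal sketch of what the matching queueing.c pair could look like under these signatures follows; it is not part of this hunk, and the crypt_queue members it touches (ring, worker, last_cpu) as well as the -1 initializer are inferred from their uses elsewhere in this diff rather than shown here.

```c
/* Hedged sketch only, not part of this patch. Assumes struct crypt_queue
 * carries a ptr_ring (ring), a per-CPU multicore_worker pointer (worker),
 * and the int last_cpu consumed by wg_cpumask_next_online() further down,
 * and that <linux/ptr_ring.h> and <linux/skb_array.h> are included.
 */
int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
                         unsigned int len)
{
        int ret;

        memset(queue, 0, sizeof(*queue));
        queue->last_cpu = -1;
        ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
        if (ret)
                return ret;
        queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
        if (!queue->worker) {
                ptr_ring_cleanup(&queue->ring, NULL);
                return -ENOMEM;
        }
        return 0;
}

void wg_packet_queue_free(struct crypt_queue *queue, bool purge)
{
        free_percpu(queue->worker);
        /* A non-empty ring is only acceptable when the caller asked to purge. */
        WARN_ON(!purge && !__ptr_ring_empty(&queue->ring));
        ptr_ring_cleanup(&queue->ring, purge ? __skb_array_destroy_skb : NULL);
}
```

In this reading, the purge flag decides whether skbs still sitting in the ring are destroyed on teardown or whether a non-empty ring is treated as a bug.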
```diff
@@ -93,13 +94,13 @@
 	skb->dev = NULL;
 #ifdef CONFIG_NET_SCHED
 	skb->tc_index = 0;
-	skb_reset_tc(skb);
 #endif
+	skb_reset_redirect(skb);
 	skb->hdr_len = skb_headroom(skb);
 	skb_reset_mac_header(skb);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
-	skb_probe_transport_header(skb, 0);
+	skb_probe_transport_header(skb);
 	skb_reset_inner_headers(skb);
 }
 
```
```diff
@@ -118,26 +119,46 @@
 	return cpu;
 }
 
-/* This function is racy, in the sense that next is unlocked, so it could return
- * the same CPU twice. A race-free version of this would be to instead store an
- * atomic sequence number, do an increment-and-return, and then iterate through
- * every possible CPU until we get to that index -- choose_cpu. However that's
- * a bit slower, and it doesn't seem like this potential race actually
- * introduces any performance loss, so we live with it.
+/* This function is racy, in the sense that it's called while last_cpu is
+ * unlocked, so it could return the same CPU twice. Adding locking or using
+ * atomic sequence numbers is slower though, and the consequences of racing are
+ * harmless, so live with it.
  */
-static inline int wg_cpumask_next_online(int *next)
+static inline int wg_cpumask_next_online(int *last_cpu)
 {
-	int cpu = *next;
-
-	while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
-		cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
-	*next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
+	int cpu = cpumask_next(*last_cpu, cpu_online_mask);
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(cpu_online_mask);
+	*last_cpu = cpu;
 	return cpu;
 }
 
+void wg_prev_queue_init(struct prev_queue *queue);
+
+/* Multi producer */
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
+
+/* Single consumer */
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
+
+/* Single consumer */
+static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
+{
+	if (queue->peeked)
+		return queue->peeked;
+	queue->peeked = wg_prev_queue_dequeue(queue);
+	return queue->peeked;
+}
+
+/* Single consumer */
+static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
+{
+	queue->peeked = NULL;
+}
+
 static inline int wg_queue_enqueue_per_device_and_peer(
-	struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
-	struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
+	struct crypt_queue *device_queue, struct prev_queue *peer_queue,
+	struct sk_buff *skb, struct workqueue_struct *wq)
 {
 	int cpu;
 
```
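The declarations above describe the new per-peer prev_queue: multiple producers may enqueue, only one consumer may dequeue, and the one-slot peeked cache lets that consumer look at the head packet without committing to removing it. A hedged sketch of how the single-consumer side might be driven is below; the worker itself, example_xmit(), and the PACKET_STATE_* names are assumptions based on the CRYPTED/DEAD states mentioned in the enqueue helper that follows.

```c
/* Hypothetical single-consumer drain, for illustration only. The peek/drop
 * split lets the consumer look at the head packet and, if crypto has not
 * finished with it yet, leave it in place and come back later without
 * losing per-peer ordering. example_xmit() and the PACKET_STATE_* names
 * are assumptions, not part of this patch.
 */
static void example_xmit(struct sk_buff *skb);  /* hypothetical hand-off */

static void example_drain_peer_queue(struct prev_queue *queue)
{
        struct sk_buff *skb;

        while ((skb = wg_prev_queue_peek(queue)) != NULL) {
                enum packet_state state =
                        atomic_read_acquire(&PACKET_CB(skb)->state);

                if (state == PACKET_STATE_UNCRYPTED)
                        break;  /* Head not processed yet; keep it peeked. */
                wg_prev_queue_drop_peeked(queue);
                if (likely(state == PACKET_STATE_CRYPTED))
                        example_xmit(skb);
                else
                        kfree_skb(skb);  /* PACKET_STATE_DEAD */
        }
}
```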
```diff
@@ -145,21 +166,20 @@
 	/* We first queue this up for the peer ingestion, but the consumer
 	 * will wait for the state to change to CRYPTED or DEAD before.
 	 */
-	if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
+	if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
 		return -ENOSPC;
+
 	/* Then we queue it up in the device queue, which consumes the
 	 * packet as soon as it can.
 	 */
-	cpu = wg_cpumask_next_online(next_cpu);
+	cpu = wg_cpumask_next_online(&device_queue->last_cpu);
 	if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
 		return -EPIPE;
 	queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
 	return 0;
 }
 
-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
-					     struct sk_buff *skb,
-					     enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
 {
 	/* We take a reference, because as soon as we call atomic_set, the
 	 * peer can be freed from below us.
```
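The two error codes above mean different things to a caller: -ENOSPC means the per-peer queue refused the packet, so the caller still owns the skb, while -EPIPE means the packet already sits in the peer queue but never reached the device ring, so no crypto worker will ever touch it and it must be marked dead for the per-peer consumer to reap. A hedged sketch of a caller honoring that contract is below; wg->encrypt_queue, peer->tx_queue, wg->packet_crypt_wq and PACKET_STATE_DEAD are assumed names used for illustration only.

```c
/* Hypothetical caller, for illustration only; not part of this patch. */
static int example_queue_for_encryption(struct wg_device *wg,
                                        struct wg_peer *peer,
                                        struct sk_buff *skb)
{
        int ret;

        ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
                                                   &peer->tx_queue, skb,
                                                   wg->packet_crypt_wq);
        if (unlikely(ret == -EPIPE))
                /* Already in the peer queue; let its consumer reap it. */
                wg_queue_enqueue_per_peer_tx(skb, PACKET_STATE_DEAD);
        else if (unlikely(ret == -ENOSPC))
                /* Never queued anywhere; the caller still owns the skb. */
                kfree_skb(skb);
        return ret;
}
```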
```diff
@@ -167,14 +187,12 @@
 	struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
 
 	atomic_set_release(&PACKET_CB(skb)->state, state);
-	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
-					       peer->internal_id),
-		      peer->device->packet_crypt_wq, &queue->work);
+	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
+		      peer->device->packet_crypt_wq, &peer->transmit_packet_work);
 	wg_peer_put(peer);
 }
 
-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
-						  enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
 {
 	/* We take a reference, because as soon as we call atomic_set, the
 	 * peer can be freed from below us.
```
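The section cuts off inside the rx variant, but its shape mirrors the tx one: publish the state with release semantics while holding a temporary peer reference, then wake the single consumer. For rx that consumer is presumably the peer's NAPI context rather than a workqueue, as the old _napi name suggests; a hedged guess at the remainder, with peer->napi assumed, looks like this.

```c
/* Hedged sketch of how the rx variant presumably finishes; this part is not
 * shown in the section above, and peer->napi is an assumption based on the
 * old wg_queue_enqueue_per_peer_napi name.
 */
static inline void example_queue_enqueue_per_peer_rx(struct sk_buff *skb,
                                                     enum packet_state state)
{
        struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));

        atomic_set_release(&PACKET_CB(skb)->state, state);
        napi_schedule(&peer->napi);  /* Wake the single consumer (NAPI poll). */
        wg_peer_put(peer);
}
```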
|---|