hc
2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/net/ipv4/inet_fragment.c
....@@ -1,10 +1,6 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * inet fragments management
3
- *
4
- * This program is free software; you can redistribute it and/or
5
- * modify it under the terms of the GNU General Public License
6
- * as published by the Free Software Foundation; either version
7
- * 2 of the License, or (at your option) any later version.
84 *
95 * Authors: Pavel Emelyanov <xemul@openvz.org>
106 * Started as consolidation of ipv4/ip_fragment.c,
....@@ -110,48 +106,90 @@
110106 if (!f->frags_cachep)
111107 return -ENOMEM;
112108
109
+ refcount_set(&f->refcnt, 1);
110
+ init_completion(&f->completion);
113111 return 0;
114112 }
115113 EXPORT_SYMBOL(inet_frags_init);
116114
117115 void inet_frags_fini(struct inet_frags *f)
118116 {
119
- /* We must wait that all inet_frag_destroy_rcu() have completed. */
120
- rcu_barrier();
117
+ if (refcount_dec_and_test(&f->refcnt))
118
+ complete(&f->completion);
119
+
120
+ wait_for_completion(&f->completion);
121121
122122 kmem_cache_destroy(f->frags_cachep);
123123 f->frags_cachep = NULL;
124124 }
125125 EXPORT_SYMBOL(inet_frags_fini);
126126
127
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
127128 static void inet_frags_free_cb(void *ptr, void *arg)
128129 {
129130 struct inet_frag_queue *fq = ptr;
131
+ int count;
130132
131
- /* If we can not cancel the timer, it means this frag_queue
132
- * is already disappearing, we have nothing to do.
133
- * Otherwise, we own a refcount until the end of this function.
134
- */
135
- if (!del_timer(&fq->timer))
136
- return;
133
+ count = del_timer_sync(&fq->timer) ? 1 : 0;
137134
138135 spin_lock_bh(&fq->lock);
139136 if (!(fq->flags & INET_FRAG_COMPLETE)) {
140137 fq->flags |= INET_FRAG_COMPLETE;
141
- refcount_dec(&fq->refcnt);
138
+ count++;
139
+ } else if (fq->flags & INET_FRAG_HASH_DEAD) {
140
+ count++;
142141 }
143142 spin_unlock_bh(&fq->lock);
144143
145
- inet_frag_put(fq);
144
+ if (refcount_sub_and_test(count, &fq->refcnt))
145
+ inet_frag_destroy(fq);
146146 }
147147
148
-void inet_frags_exit_net(struct netns_frags *nf)
148
+static void fqdir_work_fn(struct work_struct *work)
149149 {
150
- nf->high_thresh = 0; /* prevent creation of new frags */
150
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
151
+ struct inet_frags *f = fqdir->f;
151152
152
- rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
153
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
154
+
155
+ /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
156
+ * have completed, since they need to dereference fqdir.
157
+ * Would it not be nice to have kfree_rcu_barrier() ? :)
158
+ */
159
+ rcu_barrier();
160
+
161
+ if (refcount_dec_and_test(&f->refcnt))
162
+ complete(&f->completion);
163
+
164
+ kfree(fqdir);
153165 }
154
-EXPORT_SYMBOL(inet_frags_exit_net);
166
+
167
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
168
+{
169
+ struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
170
+ int res;
171
+
172
+ if (!fqdir)
173
+ return -ENOMEM;
174
+ fqdir->f = f;
175
+ fqdir->net = net;
176
+ res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
177
+ if (res < 0) {
178
+ kfree(fqdir);
179
+ return res;
180
+ }
181
+ refcount_inc(&f->refcnt);
182
+ *fqdirp = fqdir;
183
+ return 0;
184
+}
185
+EXPORT_SYMBOL(fqdir_init);
186
+
187
+void fqdir_exit(struct fqdir *fqdir)
188
+{
189
+ INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
190
+ queue_work(system_wq, &fqdir->destroy_work);
191
+}
192
+EXPORT_SYMBOL(fqdir_exit);
155193
156194 void inet_frag_kill(struct inet_frag_queue *fq)
157195 {
....@@ -159,11 +197,23 @@
159197 refcount_dec(&fq->refcnt);
160198
161199 if (!(fq->flags & INET_FRAG_COMPLETE)) {
162
- struct netns_frags *nf = fq->net;
200
+ struct fqdir *fqdir = fq->fqdir;
163201
164202 fq->flags |= INET_FRAG_COMPLETE;
165
- rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
166
- refcount_dec(&fq->refcnt);
203
+ rcu_read_lock();
204
+ /* The RCU read lock provides a memory barrier
205
+ * guaranteeing that if fqdir->dead is false then
206
+ * the hash table destruction will not start until
207
+ * after we unlock. Paired with fqdir_pre_exit().
208
+ */
209
+ if (!READ_ONCE(fqdir->dead)) {
210
+ rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
211
+ fqdir->f->rhash_params);
212
+ refcount_dec(&fq->refcnt);
213
+ } else {
214
+ fq->flags |= INET_FRAG_HASH_DEAD;
215
+ }
216
+ rcu_read_unlock();
167217 }
168218 }
169219 EXPORT_SYMBOL(inet_frag_kill);
....@@ -172,7 +222,7 @@
172222 {
173223 struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
174224 rcu);
175
- struct inet_frags *f = q->net->f;
225
+ struct inet_frags *f = q->fqdir->f;
176226
177227 if (f->destructor)
178228 f->destructor(q);
....@@ -203,8 +253,7 @@
203253
204254 void inet_frag_destroy(struct inet_frag_queue *q)
205255 {
206
- struct sk_buff *fp;
207
- struct netns_frags *nf;
256
+ struct fqdir *fqdir;
208257 unsigned int sum, sum_truesize = 0;
209258 struct inet_frags *f;
210259
....@@ -212,29 +261,18 @@
212261 WARN_ON(del_timer(&q->timer) != 0);
213262
214263 /* Release all fragment data. */
215
- fp = q->fragments;
216
- nf = q->net;
217
- f = nf->f;
218
- if (fp) {
219
- do {
220
- struct sk_buff *xp = fp->next;
221
-
222
- sum_truesize += fp->truesize;
223
- kfree_skb(fp);
224
- fp = xp;
225
- } while (fp);
226
- } else {
227
- sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
228
- }
264
+ fqdir = q->fqdir;
265
+ f = fqdir->f;
266
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
229267 sum = sum_truesize + f->qsize;
230268
231269 call_rcu(&q->rcu, inet_frag_destroy_rcu);
232270
233
- sub_frag_mem_limit(nf, sum);
271
+ sub_frag_mem_limit(fqdir, sum);
234272 }
235273 EXPORT_SYMBOL(inet_frag_destroy);
236274
237
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
275
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
238276 struct inet_frags *f,
239277 void *arg)
240278 {
....@@ -244,9 +282,9 @@
244282 if (!q)
245283 return NULL;
246284
247
- q->net = nf;
285
+ q->fqdir = fqdir;
248286 f->constructor(q, arg);
249
- add_frag_mem_limit(nf, f->qsize);
287
+ add_frag_mem_limit(fqdir, f->qsize);
250288
251289 timer_setup(&q->timer, f->frag_expire, 0);
252290 spin_lock_init(&q->lock);
....@@ -255,21 +293,21 @@
255293 return q;
256294 }
257295
258
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
296
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
259297 void *arg,
260298 struct inet_frag_queue **prev)
261299 {
262
- struct inet_frags *f = nf->f;
300
+ struct inet_frags *f = fqdir->f;
263301 struct inet_frag_queue *q;
264302
265
- q = inet_frag_alloc(nf, f, arg);
303
+ q = inet_frag_alloc(fqdir, f, arg);
266304 if (!q) {
267305 *prev = ERR_PTR(-ENOMEM);
268306 return NULL;
269307 }
270
- mod_timer(&q->timer, jiffies + nf->timeout);
308
+ mod_timer(&q->timer, jiffies + fqdir->timeout);
271309
272
- *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
310
+ *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
273311 &q->node, f->rhash_params);
274312 if (*prev) {
275313 q->flags |= INET_FRAG_COMPLETE;
....@@ -281,19 +319,21 @@
281319 }
282320
283321 /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
284
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
322
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
285323 {
324
+ /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
325
+ long high_thresh = READ_ONCE(fqdir->high_thresh);
286326 struct inet_frag_queue *fq = NULL, *prev;
287327
288
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
328
+ if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
289329 return NULL;
290330
291331 rcu_read_lock();
292332
293
- prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
333
+ prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
294334 if (!prev)
295
- fq = inet_frag_create(nf, key, &prev);
296
- if (prev && !IS_ERR(prev)) {
335
+ fq = inet_frag_create(fqdir, key, &prev);
336
+ if (!IS_ERR_OR_NULL(prev)) {
297337 fq = prev;
298338 if (!refcount_inc_not_zero(&fq->refcnt))
299339 fq = NULL;
....@@ -403,7 +443,7 @@
403443
404444 delta += head->truesize;
405445 if (delta)
406
- add_frag_mem_limit(q->net, delta);
446
+ add_frag_mem_limit(q->fqdir, delta);
407447
408448 /* If the first fragment is fragmented itself, we split
409449 * it to two chunks: the first with data and paged part
....@@ -425,7 +465,7 @@
425465 head->truesize += clone->truesize;
426466 clone->csum = 0;
427467 clone->ip_summed = head->ip_summed;
428
- add_frag_mem_limit(q->net, clone->truesize);
468
+ add_frag_mem_limit(q->fqdir, clone->truesize);
429469 skb_shinfo(head)->frag_list = clone;
430470 nextp = &clone->next;
431471 } else {
....@@ -437,11 +477,12 @@
437477 EXPORT_SYMBOL(inet_frag_reasm_prepare);
438478
439479 void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
440
- void *reasm_data)
480
+ void *reasm_data, bool try_coalesce)
441481 {
442482 struct sk_buff **nextp = (struct sk_buff **)reasm_data;
443483 struct rb_node *rbn;
444484 struct sk_buff *fp;
485
+ int sum_truesize;
445486
446487 skb_push(head, head->data - skb_network_header(head));
447488
....@@ -449,25 +490,41 @@
449490 fp = FRAG_CB(head)->next_frag;
450491 rbn = rb_next(&head->rbnode);
451492 rb_erase(&head->rbnode, &q->rb_fragments);
493
+
494
+ sum_truesize = head->truesize;
452495 while (rbn || fp) {
453496 /* fp points to the next sk_buff in the current run;
454497 * rbn points to the next run.
455498 */
456499 /* Go through the current run. */
457500 while (fp) {
458
- *nextp = fp;
459
- nextp = &fp->next;
460
- fp->prev = NULL;
461
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
462
- fp->sk = NULL;
463
- head->data_len += fp->len;
464
- head->len += fp->len;
501
+ struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
502
+ bool stolen;
503
+ int delta;
504
+
505
+ sum_truesize += fp->truesize;
465506 if (head->ip_summed != fp->ip_summed)
466507 head->ip_summed = CHECKSUM_NONE;
467508 else if (head->ip_summed == CHECKSUM_COMPLETE)
468509 head->csum = csum_add(head->csum, fp->csum);
469
- head->truesize += fp->truesize;
470
- fp = FRAG_CB(fp)->next_frag;
510
+
511
+ if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
512
+ &delta)) {
513
+ kfree_skb_partial(fp, stolen);
514
+ } else {
515
+ fp->prev = NULL;
516
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
517
+ fp->sk = NULL;
518
+
519
+ head->data_len += fp->len;
520
+ head->len += fp->len;
521
+ head->truesize += fp->truesize;
522
+
523
+ *nextp = fp;
524
+ nextp = &fp->next;
525
+ }
526
+
527
+ fp = next_frag;
471528 }
472529 /* Move to the next run. */
473530 if (rbn) {
....@@ -478,7 +535,7 @@
478535 rbn = rbnext;
479536 }
480537 }
481
- sub_frag_mem_limit(q->net, head->truesize);
538
+ sub_frag_mem_limit(q->fqdir, sum_truesize);
482539
483540 *nextp = NULL;
484541 skb_mark_not_on_list(head);
....@@ -489,30 +546,24 @@
489546
490547 struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
491548 {
492
- struct sk_buff *head;
549
+ struct sk_buff *head, *skb;
493550
494
- if (q->fragments) {
495
- head = q->fragments;
496
- q->fragments = head->next;
497
- } else {
498
- struct sk_buff *skb;
551
+ head = skb_rb_first(&q->rb_fragments);
552
+ if (!head)
553
+ return NULL;
554
+ skb = FRAG_CB(head)->next_frag;
555
+ if (skb)
556
+ rb_replace_node(&head->rbnode, &skb->rbnode,
557
+ &q->rb_fragments);
558
+ else
559
+ rb_erase(&head->rbnode, &q->rb_fragments);
560
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
561
+ barrier();
499562
500
- head = skb_rb_first(&q->rb_fragments);
501
- if (!head)
502
- return NULL;
503
- skb = FRAG_CB(head)->next_frag;
504
- if (skb)
505
- rb_replace_node(&head->rbnode, &skb->rbnode,
506
- &q->rb_fragments);
507
- else
508
- rb_erase(&head->rbnode, &q->rb_fragments);
509
- memset(&head->rbnode, 0, sizeof(head->rbnode));
510
- barrier();
511
- }
512563 if (head == q->fragments_tail)
513564 q->fragments_tail = NULL;
514565
515
- sub_frag_mem_limit(q->net, head->truesize);
566
+ sub_frag_mem_limit(q->fqdir, head->truesize);
516567
517568 return head;
518569 }