2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/mm/mmu_notifier.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/mmu_notifier.c
  *
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright (C) 2008 SGI
  *             Christoph Lameter <cl@linux.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */

 #include <linux/rculist.h>
@@ -14,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/interval_tree.h>
 #include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -23,24 +22,273 @@
 /* global SRCU for all MMs */
 DEFINE_STATIC_SRCU(srcu);

-/*
- * This function allows mmu_notifier::release callback to delay a call to
- * a function that will free appropriate resources. The function must be
- * quick and must not block.
- */
-void mmu_notifier_call_srcu(struct rcu_head *rcu,
-                            void (*func)(struct rcu_head *rcu))
-{
-        call_srcu(&srcu, rcu, func);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
+        .name = "mmu_notifier_invalidate_range_start"
+};
+#endif

-void mmu_notifier_synchronize(void)
+/*
+ * The mmu_notifier_subscriptions structure is allocated and installed in
+ * mm->notifier_subscriptions inside the mm_take_all_locks() protected
+ * critical section and it's released only when mm_count reaches zero
+ * in mmdrop().
+ */
+struct mmu_notifier_subscriptions {
+        /*
+         * WARNING: hdr should be the first member of this structure
+         * so that it can be typecasted into mmu_notifier_subscriptions_hdr.
+         * This is required to avoid KMI CRC breakage.
+         */
+        struct mmu_notifier_subscriptions_hdr hdr;
+        /* all mmu notifiers registered in this mm are queued in this list */
+        struct hlist_head list;
+        bool has_itree;
+        /* to serialize the list modifications and hlist_unhashed */
+        spinlock_t lock;
+        unsigned long invalidate_seq;
+        unsigned long active_invalidate_ranges;
+        struct rb_root_cached itree;
+        wait_queue_head_t wq;
+        struct hlist_head deferred_list;
+};
+
+/*
+ * This is a collision-retry read-side/write-side 'lock', a lot like a
+ * seqcount, however this allows multiple write-sides to hold it at
+ * once. Conceptually the write side is protecting the values of the PTEs in
+ * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
+ * writer exists.
+ *
+ * Note that the core mm creates nested invalidate_range_start()/end() regions
+ * within the same thread, and runs invalidate_range_start()/end() in parallel
+ * on multiple CPUs. This is designed to not reduce concurrency or block
+ * progress on the mm side.
+ *
+ * As a secondary function, holding the full write side also serves to prevent
+ * writers for the itree, this is an optimization to avoid extra locking
+ * during invalidate_range_start/end notifiers.
+ *
+ * The write side has two states, fully excluded:
+ *  - mm->active_invalidate_ranges != 0
+ *  - subscriptions->invalidate_seq & 1 == True (odd)
+ *  - some range on the mm_struct is being invalidated
+ *  - the itree is not allowed to change
+ *
+ * And partially excluded:
+ *  - mm->active_invalidate_ranges != 0
+ *  - subscriptions->invalidate_seq & 1 == False (even)
+ *  - some range on the mm_struct is being invalidated
+ *  - the itree is allowed to change
+ *
+ * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
+ *    seq |= 1  # Begin writing
+ *    seq++     # Release the writing state
+ *    seq & 1   # True if a writer exists
+ *
+ * The latter state avoids some expensive work on inv_end in the common case of
+ * no mmu_interval_notifier monitoring the VA.
+ */
+static bool
+mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
 {
-        /* Wait for any running method to finish. */
-        srcu_barrier(&srcu);
+        lockdep_assert_held(&subscriptions->lock);
+        return subscriptions->invalidate_seq & 1;
 }
-EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+static struct mmu_interval_notifier *
+mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
+                         const struct mmu_notifier_range *range,
+                         unsigned long *seq)
+{
+        struct interval_tree_node *node;
+        struct mmu_interval_notifier *res = NULL;
+
+        spin_lock(&subscriptions->lock);
+        subscriptions->active_invalidate_ranges++;
+        node = interval_tree_iter_first(&subscriptions->itree, range->start,
+                                        range->end - 1);
+        if (node) {
+                subscriptions->invalidate_seq |= 1;
+                res = container_of(node, struct mmu_interval_notifier,
+                                   interval_tree);
+        }
+
+        *seq = subscriptions->invalidate_seq;
+        spin_unlock(&subscriptions->lock);
+        return res;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
+                  const struct mmu_notifier_range *range)
+{
+        struct interval_tree_node *node;
+
+        node = interval_tree_iter_next(&interval_sub->interval_tree,
+                                       range->start, range->end - 1);
+        if (!node)
+                return NULL;
+        return container_of(node, struct mmu_interval_notifier, interval_tree);
+}
+
+static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
+{
+        struct mmu_interval_notifier *interval_sub;
+        struct hlist_node *next;
+
+        spin_lock(&subscriptions->lock);
+        if (--subscriptions->active_invalidate_ranges ||
+            !mn_itree_is_invalidating(subscriptions)) {
+                spin_unlock(&subscriptions->lock);
+                return;
+        }
+
+        /* Make invalidate_seq even */
+        subscriptions->invalidate_seq++;
+
+        /*
+         * The inv_end incorporates a deferred mechanism like rtnl_unlock().
+         * Adds and removes are queued until the final inv_end happens then
+         * they are progressed. This arrangement for tree updates is used to
+         * avoid using a blocking lock during invalidate_range_start.
+         */
+        hlist_for_each_entry_safe(interval_sub, next,
+                                  &subscriptions->deferred_list,
+                                  deferred_item) {
+                if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
+                        interval_tree_insert(&interval_sub->interval_tree,
+                                             &subscriptions->itree);
+                else
+                        interval_tree_remove(&interval_sub->interval_tree,
+                                             &subscriptions->itree);
+                hlist_del(&interval_sub->deferred_item);
+        }
+        spin_unlock(&subscriptions->lock);
+
+        wake_up_all(&subscriptions->wq);
+}
+
+/**
+ * mmu_interval_read_begin - Begin a read side critical section against a VA
+ *                           range
+ * @interval_sub: The interval subscription
+ *
+ * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
+ * collision-retry scheme similar to seqcount for the VA range under
+ * subscription. If the mm invokes invalidation during the critical section
+ * then mmu_interval_read_retry() will return true.
+ *
+ * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
+ * require a blocking context. The critical region formed by this can sleep,
+ * and the required 'user_lock' can also be a sleeping lock.
+ *
+ * The caller is required to provide a 'user_lock' to serialize both teardown
+ * and setup.
+ *
+ * The return value should be passed to mmu_interval_read_retry().
+ */
+unsigned long
+mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
+{
+        struct mmu_notifier_subscriptions *subscriptions =
+                interval_sub->mm->notifier_subscriptions;
+        unsigned long seq;
+        bool is_invalidating;
+
+        /*
+         * If the subscription has a different seq value under the user_lock
+         * than we started with then it has collided.
+         *
+         * If the subscription currently has the same seq value as the
+         * subscriptions seq, then it is currently between
+         * invalidate_start/end and is colliding.
+         *
+         * The locking looks broadly like this:
+         *       mn_tree_invalidate_start():     mmu_interval_read_begin():
+         *                                        spin_lock
+         *                                         seq = READ_ONCE(interval_sub->invalidate_seq);
+         *                                         seq == subs->invalidate_seq
+         *                                        spin_unlock
+         *        spin_lock
+         *         seq = ++subscriptions->invalidate_seq
+         *        spin_unlock
+         *        op->invalidate_range():
+         *          user_lock
+         *           mmu_interval_set_seq()
+         *            interval_sub->invalidate_seq = seq
+         *          user_unlock
+         *
+         *                     [Required: mmu_interval_read_retry() == true]
+         *
+         *       mn_itree_inv_end():
+         *        spin_lock
+         *         seq = ++subscriptions->invalidate_seq
+         *        spin_unlock
+         *
+         *                                        user_lock
+         *                                         mmu_interval_read_retry():
+         *                                          interval_sub->invalidate_seq != seq
+         *                                        user_unlock
+         *
+         * Barriers are not needed here as any races here are closed by an
+         * eventual mmu_interval_read_retry(), which provides a barrier via the
+         * user_lock.
+         */
+        spin_lock(&subscriptions->lock);
+        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+        seq = READ_ONCE(interval_sub->invalidate_seq);
+        is_invalidating = seq == subscriptions->invalidate_seq;
+        spin_unlock(&subscriptions->lock);
+
+        /*
+         * interval_sub->invalidate_seq must always be set to an odd value via
+         * mmu_interval_set_seq() using the provided cur_seq from
+         * mn_itree_inv_start_range(). This ensures that if seq does wrap we
+         * will always clear the below sleep in some reasonable time as
+         * subscriptions->invalidate_seq is even in the idle state.
+         */
+        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+        if (is_invalidating)
+                wait_event(subscriptions->wq,
+                           READ_ONCE(subscriptions->invalidate_seq) != seq);
+
+        /*
+         * Notice that mmu_interval_read_retry() can already be true at this
+         * point, avoiding loops here allows the caller to provide a global
+         * time bound.
+         */
+
+        return seq;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
+
+static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
+                             struct mm_struct *mm)
+{
+        struct mmu_notifier_range range = {
+                .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
+                .event = MMU_NOTIFY_RELEASE,
+                .mm = mm,
+                .start = 0,
+                .end = ULONG_MAX,
+        };
+        struct mmu_interval_notifier *interval_sub;
+        unsigned long cur_seq;
+        bool ret;
+
+        for (interval_sub =
+                     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
+             interval_sub;
+             interval_sub = mn_itree_inv_next(interval_sub, &range)) {
+                ret = interval_sub->ops->invalidate(interval_sub, &range,
+                                                    cur_seq);
+                WARN_ON(!ret);
+        }
+
+        mn_itree_inv_end(subscriptions);
+}

 /*
  * This function can't run concurrently against mmu_notifier_register
@@ -49,14 +297,15 @@
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to SRCU and it serializes
- * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
+ * the notifier_subscriptions->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
-void __mmu_notifier_release(struct mm_struct *mm)
+static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
+                             struct mm_struct *mm)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int id;

         /*
@@ -64,30 +313,30 @@
          * ->release returns.
          */
         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu))
                 /*
                  * If ->release runs before mmu_notifier_unregister it must be
                  * handled, as it's the only way for the driver to flush all
                  * existing sptes and stop the driver from establishing any more
                  * sptes before all the pages in the mm are freed.
                  */
-                if (mn->ops->release)
-                        mn->ops->release(mn, mm);
+                if (subscription->ops->release)
+                        subscription->ops->release(subscription, mm);

-        spin_lock(&mm->mmu_notifier_mm->lock);
-        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
-                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
-                                 struct mmu_notifier,
-                                 hlist);
+        spin_lock(&subscriptions->lock);
+        while (unlikely(!hlist_empty(&subscriptions->list))) {
+                subscription = hlist_entry(subscriptions->list.first,
+                                           struct mmu_notifier, hlist);
                 /*
                  * We arrived before mmu_notifier_unregister so
                  * mmu_notifier_unregister will do nothing other than to wait
                  * for ->release to finish and for mmu_notifier_unregister to
                  * return.
                  */
-                hlist_del_init_rcu(&mn->hlist);
+                hlist_del_init_rcu(&subscription->hlist);
         }
-        spin_unlock(&mm->mmu_notifier_mm->lock);
+        spin_unlock(&subscriptions->lock);
         srcu_read_unlock(&srcu, id);

         /*
@@ -96,10 +345,22 @@
          * until the ->release method returns, if it was invoked by
          * mmu_notifier_unregister.
          *
-         * The mmu_notifier_mm can't go away from under us because one mm_count
-         * is held by exit_mmap.
+         * The notifier_subscriptions can't go away from under us because
+         * one mm_count is held by exit_mmap.
          */
         synchronize_srcu(&srcu);
+}
+
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+        struct mmu_notifier_subscriptions *subscriptions =
+                mm->notifier_subscriptions;
+
+        if (subscriptions->has_itree)
+                mn_itree_release(subscriptions, mm);
+
+        if (!hlist_empty(&subscriptions->list))
+                mn_hlist_release(subscriptions, mm);
 }

 /*
@@ -111,13 +372,16 @@
                                         unsigned long start,
                                         unsigned long end)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int young = 0, id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->clear_flush_young)
-                        young |= mn->ops->clear_flush_young(mn, mm, start, end);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                if (subscription->ops->clear_flush_young)
+                        young |= subscription->ops->clear_flush_young(
+                                subscription, mm, start, end);
         }
         srcu_read_unlock(&srcu, id);

@@ -128,13 +392,16 @@
                               unsigned long start,
                               unsigned long end)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int young = 0, id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->clear_young)
-                        young |= mn->ops->clear_young(mn, mm, start, end);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                if (subscription->ops->clear_young)
+                        young |= subscription->ops->clear_young(subscription,
+                                                                mm, start, end);
         }
         srcu_read_unlock(&srcu, id);

@@ -144,13 +411,16 @@
 int __mmu_notifier_test_young(struct mm_struct *mm,
                               unsigned long address)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int young = 0, id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->test_young) {
-                        young = mn->ops->test_young(mn, mm, address);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                if (subscription->ops->test_young) {
+                        young = subscription->ops->test_young(subscription, mm,
+                                                              address);
                         if (young)
                                 break;
                 }
@@ -163,53 +433,138 @@
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                                pte_t pte)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->change_pte)
-                        mn->ops->change_pte(mn, mm, address, pte);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                if (subscription->ops->change_pte)
+                        subscription->ops->change_pte(subscription, mm, address,
+                                                      pte);
         }
         srcu_read_unlock(&srcu, id);
 }

-int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-                                  unsigned long start, unsigned long end,
-                                  bool blockable)
+static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
+                               const struct mmu_notifier_range *range)
 {
-        struct mmu_notifier *mn;
+        struct mmu_interval_notifier *interval_sub;
+        unsigned long cur_seq;
+
+        for (interval_sub =
+                     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
+             interval_sub;
+             interval_sub = mn_itree_inv_next(interval_sub, range)) {
+                bool ret;
+
+                ret = interval_sub->ops->invalidate(interval_sub, range,
+                                                    cur_seq);
+                if (!ret) {
+                        if (WARN_ON(mmu_notifier_range_blockable(range)))
+                                continue;
+                        goto out_would_block;
+                }
+        }
+        return 0;
+
+out_would_block:
+        /*
+         * On -EAGAIN the non-blocking caller is not allowed to call
+         * invalidate_range_end()
+         */
+        mn_itree_inv_end(subscriptions);
+        return -EAGAIN;
+}
+
+static int mn_hlist_invalidate_range_start(
+        struct mmu_notifier_subscriptions *subscriptions,
+        struct mmu_notifier_range *range)
+{
+        struct mmu_notifier *subscription;
         int ret = 0;
         int id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->invalidate_range_start) {
-                        int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                const struct mmu_notifier_ops *ops = subscription->ops;
+
+                if (ops->invalidate_range_start) {
+                        int _ret;
+
+                        if (!mmu_notifier_range_blockable(range))
+                                non_block_start();
+                        _ret = ops->invalidate_range_start(subscription, range);
+                        if (!mmu_notifier_range_blockable(range))
+                                non_block_end();
                         if (_ret) {
                                 pr_info("%pS callback failed with %d in %sblockable context.\n",
-                                        mn->ops->invalidate_range_start, _ret,
-                                        !blockable ? "non-" : "");
+                                        ops->invalidate_range_start, _ret,
+                                        !mmu_notifier_range_blockable(range) ?
+                                                "non-" :
+                                                "");
+                                WARN_ON(mmu_notifier_range_blockable(range) ||
+                                        _ret != -EAGAIN);
+                                /*
+                                 * We call all the notifiers on any EAGAIN,
+                                 * there is no way for a notifier to know if
+                                 * its start method failed, thus a start that
+                                 * does EAGAIN can't also do end.
+                                 */
+                                WARN_ON(ops->invalidate_range_end);
                                 ret = _ret;
                         }
+                }
+        }
+
+        if (ret) {
+                /*
+                 * Must be non-blocking to get here. If there are multiple
+                 * notifiers and one or more failed start, any that succeeded
+                 * start are expecting their end to be called. Do so now.
+                 */
+                hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+                                         hlist, srcu_read_lock_held(&srcu)) {
+                        if (!subscription->ops->invalidate_range_end)
+                                continue;
+
+                        subscription->ops->invalidate_range_end(subscription,
+                                                                range);
                 }
         }
         srcu_read_unlock(&srcu, id);

         return ret;
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);

-void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-                                         unsigned long start,
-                                         unsigned long end,
-                                         bool only_end)
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier_subscriptions *subscriptions =
+                range->mm->notifier_subscriptions;
+        int ret;
+
+        if (subscriptions->has_itree) {
+                ret = mn_itree_invalidate(subscriptions, range);
+                if (ret)
+                        return ret;
+        }
+        if (!hlist_empty(&subscriptions->list))
+                return mn_hlist_invalidate_range_start(subscriptions, range);
+        return 0;
+}
+
+static void
+mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
+                        struct mmu_notifier_range *range, bool only_end)
+{
+        struct mmu_notifier *subscription;
         int id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
                 /*
                  * Call invalidate_range here too to avoid the need for the
                  * subsystem of having to register an invalidate_range_end
@@ -223,89 +578,128 @@
                  * is safe to do when we know that a call to invalidate_range()
                  * already happen under page table lock.
                  */
-                if (!only_end && mn->ops->invalidate_range)
-                        mn->ops->invalidate_range(mn, mm, start, end);
-                if (mn->ops->invalidate_range_end)
-                        mn->ops->invalidate_range_end(mn, mm, start, end);
+                if (!only_end && subscription->ops->invalidate_range)
+                        subscription->ops->invalidate_range(subscription,
+                                                            range->mm,
+                                                            range->start,
+                                                            range->end);
+                if (subscription->ops->invalidate_range_end) {
+                        if (!mmu_notifier_range_blockable(range))
+                                non_block_start();
+                        subscription->ops->invalidate_range_end(subscription,
+                                                                range);
+                        if (!mmu_notifier_range_blockable(range))
+                                non_block_end();
+                }
         }
         srcu_read_unlock(&srcu, id);
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
+
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
+                                         bool only_end)
+{
+        struct mmu_notifier_subscriptions *subscriptions =
+                range->mm->notifier_subscriptions;
+
+        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+        if (subscriptions->has_itree)
+                mn_itree_inv_end(subscriptions);
+
+        if (!hlist_empty(&subscriptions->list))
+                mn_hlist_invalidate_end(subscriptions, range, only_end);
+        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+}

 void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
 {
-        struct mmu_notifier *mn;
+        struct mmu_notifier *subscription;
         int id;

         id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (mn->ops->invalidate_range)
-                        mn->ops->invalidate_range(mn, mm, start, end);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 srcu_read_lock_held(&srcu)) {
+                if (subscription->ops->invalidate_range)
+                        subscription->ops->invalidate_range(subscription, mm,
+                                                            start, end);
         }
         srcu_read_unlock(&srcu, id);
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+
+static inline void mmu_notifier_write_lock(struct mm_struct *mm)
+{
+        percpu_down_write(
+                &mm->notifier_subscriptions->hdr.mmu_notifier_lock->rw_sem);
+}
+
+static inline void mmu_notifier_write_unlock(struct mm_struct *mm)
+{
+        percpu_up_write(
+                &mm->notifier_subscriptions->hdr.mmu_notifier_lock->rw_sem);
+}
+
+#else /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
+static inline void mmu_notifier_write_lock(struct mm_struct *mm) {}
+static inline void mmu_notifier_write_unlock(struct mm_struct *mm) {}
+
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
+static void init_subscriptions(struct mmu_notifier_subscriptions *subscriptions)
+{
+        INIT_HLIST_HEAD(&subscriptions->list);
+        spin_lock_init(&subscriptions->lock);
+        subscriptions->invalidate_seq = 2;
+        subscriptions->itree = RB_ROOT_CACHED;
+        init_waitqueue_head(&subscriptions->wq);
+        INIT_HLIST_HEAD(&subscriptions->deferred_list);
+}

 /*
- * Must be called while holding mm->mmap_sem for either read or write.
- * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
+ * write mode. A NULL mn signals the notifier is being registered for itree
+ * mode.
  */
-bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+int __mmu_notifier_register(struct mmu_notifier *subscription,
+                            struct mm_struct *mm)
 {
-        struct mmu_notifier *mn;
-        int id;
-        bool ret = false;
-
-        WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
-
-        if (!mm_has_notifiers(mm))
-                return ret;
-
-        id = srcu_read_lock(&srcu);
-        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-                if (!mn->ops->invalidate_range &&
-                    !mn->ops->invalidate_range_start &&
-                    !mn->ops->invalidate_range_end)
-                        continue;
-
-                if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
-                        ret = true;
-                        break;
-                }
-        }
-        srcu_read_unlock(&srcu, id);
-        return ret;
-}
-
-static int do_mmu_notifier_register(struct mmu_notifier *mn,
-                                    struct mm_struct *mm,
-                                    int take_mmap_sem)
-{
-        struct mmu_notifier_mm *mmu_notifier_mm;
+        struct mmu_notifier_subscriptions *subscriptions = NULL;
         int ret;

+        mmap_assert_write_locked(mm);
         BUG_ON(atomic_read(&mm->mm_users) <= 0);

-        ret = -ENOMEM;
-        mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
-        if (unlikely(!mmu_notifier_mm))
-                goto out;
-
-        if (take_mmap_sem)
-                down_write(&mm->mmap_sem);
-        ret = mm_take_all_locks(mm);
-        if (unlikely(ret))
-                goto out_clean;
-
-        if (!mm_has_notifiers(mm)) {
-                INIT_HLIST_HEAD(&mmu_notifier_mm->list);
-                spin_lock_init(&mmu_notifier_mm->lock);
-
-                mm->mmu_notifier_mm = mmu_notifier_mm;
-                mmu_notifier_mm = NULL;
+        if (IS_ENABLED(CONFIG_LOCKDEP)) {
+                fs_reclaim_acquire(GFP_KERNEL);
+                lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+                lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+                fs_reclaim_release(GFP_KERNEL);
         }
-        mmgrab(mm);
+
+        if (!mm->notifier_subscriptions) {
+                /*
+                 * kmalloc cannot be called under mm_take_all_locks(), but we
+                 * know that mm->notifier_subscriptions can't change while we
+                 * hold the write side of the mmap_lock.
+                 */
+                subscriptions = kzalloc(
+                        sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
+                if (!subscriptions)
+                        return -ENOMEM;
+
+                init_subscriptions(subscriptions);
+        }
+
+        mmu_notifier_write_lock(mm);
+
+        ret = mm_take_all_locks(mm);
+        if (unlikely(ret)) {
+                mmu_notifier_write_unlock(mm);
+                goto out_clean;
+        }

         /*
          * Serialize the update against mmu_notifier_unregister. A
@@ -314,56 +708,148 @@
          * current->mm or explicitly with get_task_mm() or similar).
          * We can't race against any other mmu notifier method either
          * thanks to mm_take_all_locks().
+         *
+         * release semantics on the initialization of the
+         * mmu_notifier_subscriptions's contents are provided for unlocked
+         * readers. acquire can only be used while holding the mmgrab or
+         * mmget, and is safe because once created the
+         * mmu_notifier_subscriptions is not freed until the mm is destroyed.
+         * As above, users holding the mmap_lock or one of the
+         * mm_take_all_locks() do not need to use acquire semantics.
          */
-        spin_lock(&mm->mmu_notifier_mm->lock);
-        hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
-        spin_unlock(&mm->mmu_notifier_mm->lock);
+        if (subscriptions)
+                smp_store_release(&mm->notifier_subscriptions, subscriptions);
+        mm->notifier_subscriptions->hdr.valid = true;
+
+        if (subscription) {
+                /* Pairs with the mmdrop in mmu_notifier_unregister_* */
+                mmgrab(mm);
+                subscription->mm = mm;
+                subscription->users = 1;
+
+                spin_lock(&mm->notifier_subscriptions->lock);
+                hlist_add_head_rcu(&subscription->hlist,
+                                   &mm->notifier_subscriptions->list);
+                spin_unlock(&mm->notifier_subscriptions->lock);
+        } else
+                mm->notifier_subscriptions->has_itree = true;

         mm_drop_all_locks(mm);
-out_clean:
-        if (take_mmap_sem)
-                up_write(&mm->mmap_sem);
-        kfree(mmu_notifier_mm);
-out:
+        mmu_notifier_write_unlock(mm);
         BUG_ON(atomic_read(&mm->mm_users) <= 0);
+        return 0;
+
+out_clean:
+        kfree(subscriptions);
         return ret;
 }
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);

-/*
- * Must not hold mmap_sem nor any other VM related lock when calling
+/**
+ * mmu_notifier_register - Register a notifier on a mm
+ * @subscription: The notifier to attach
+ * @mm: The mm to attach the notifier to
+ *
+ * Must not hold mmap_lock nor any other VM related lock when calling
  * this registration function. Must also ensure mm_users can't go down
  * to zero while this runs to avoid races with mmu_notifier_release,
  * so mm has to be current->mm or the mm should be pinned safely such
  * as with get_task_mm(). If the mm is not current->mm, the mm_users
  * pin should be released by calling mmput after mmu_notifier_register
- * returns. mmu_notifier_unregister must be always called to
- * unregister the notifier. mm_count is automatically pinned to allow
- * mmu_notifier_unregister to safely run at any time later, before or
- * after exit_mmap. ->release will always be called before exit_mmap
- * frees the pages.
+ * returns.
+ *
+ * mmu_notifier_unregister() or mmu_notifier_put() must always be called to
+ * unregister the notifier.
+ *
+ * While the caller has a mmu_notifier get the subscription->mm pointer will
+ * remain valid, and can be converted to an active mm pointer via
+ * mmget_not_zero().
  */
-int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+int mmu_notifier_register(struct mmu_notifier *subscription,
+                          struct mm_struct *mm)
 {
-        return do_mmu_notifier_register(mn, mm, 1);
+        int ret;
+
+        mmap_write_lock(mm);
+        ret = __mmu_notifier_register(subscription, mm);
+        mmap_write_unlock(mm);
+        return ret;
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_register);

-/*
- * Same as mmu_notifier_register but here the caller must hold the
- * mmap_sem in write mode.
- */
-int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+static struct mmu_notifier *
+find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
 {
-        return do_mmu_notifier_register(mn, mm, 0);
+        struct mmu_notifier *subscription;
+
+        spin_lock(&mm->notifier_subscriptions->lock);
+        hlist_for_each_entry_rcu(subscription,
+                                 &mm->notifier_subscriptions->list, hlist,
+                                 lockdep_is_held(&mm->notifier_subscriptions->lock)) {
+                if (subscription->ops != ops)
+                        continue;
+
+                if (likely(subscription->users != UINT_MAX))
+                        subscription->users++;
+                else
+                        subscription = ERR_PTR(-EOVERFLOW);
+                spin_unlock(&mm->notifier_subscriptions->lock);
+                return subscription;
+        }
+        spin_unlock(&mm->notifier_subscriptions->lock);
+        return NULL;
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/**
+ * mmu_notifier_get_locked - Return the single struct mmu_notifier for
+ *                           the mm & ops
+ * @ops: The operations struct being subscribed with
+ * @mm: The mm to attach notifiers to
+ *
+ * This function either allocates a new mmu_notifier via
+ * ops->alloc_notifier(), or returns an already existing notifier on the
+ * list. The value of the ops pointer is used to determine when two notifiers
+ * are the same.
+ *
+ * Each call to mmu_notifier_get() must be paired with a call to
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
+ *
+ * While the caller has a mmu_notifier get the mm pointer will remain valid,
+ * and can be converted to an active mm pointer via mmget_not_zero().
+ */
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
+                                             struct mm_struct *mm)
+{
+        struct mmu_notifier *subscription;
+        int ret;
+
+        mmap_assert_write_locked(mm);
+
+        if (mm->notifier_subscriptions) {
+                subscription = find_get_mmu_notifier(mm, ops);
+                if (subscription)
+                        return subscription;
+        }
+
+        subscription = ops->alloc_notifier(mm);
+        if (IS_ERR(subscription))
+                return subscription;
+        subscription->ops = ops;
+        ret = __mmu_notifier_register(subscription, mm);
+        if (ret)
+                goto out_free;
+        return subscription;
+out_free:
+        subscription->ops->free_notifier(subscription);
+        return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);

 /* this is called after the last mmu_notifier_unregister() returned */
-void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 {
-        BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
-        kfree(mm->mmu_notifier_mm);
-        mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+        BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
+        kfree(mm->notifier_subscriptions);
+        mm->notifier_subscriptions = LIST_POISON1; /* debug */
 }

 /*
@@ -376,11 +862,12 @@
  * and only after mmu_notifier_unregister returned we're guaranteed
  * that ->release or any other method can't run anymore.
  */
-void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+void mmu_notifier_unregister(struct mmu_notifier *subscription,
+                             struct mm_struct *mm)
 {
         BUG_ON(atomic_read(&mm->mm_count) <= 0);

-        if (!hlist_unhashed(&mn->hlist)) {
+        if (!hlist_unhashed(&subscription->hlist)) {
                 /*
                  * SRCU here will force exit_mmap to wait for ->release to
                  * finish before freeing the pages.
@@ -392,17 +879,17 @@
                  * exit_mmap will block in mmu_notifier_release to guarantee
                  * that ->release is called before freeing the pages.
                  */
-                if (mn->ops->release)
-                        mn->ops->release(mn, mm);
+                if (subscription->ops->release)
+                        subscription->ops->release(subscription, mm);
                 srcu_read_unlock(&srcu, id);

-                spin_lock(&mm->mmu_notifier_mm->lock);
+                spin_lock(&mm->notifier_subscriptions->lock);
                 /*
                  * Can not use list_del_rcu() since __mmu_notifier_release
                  * can delete it before we hold the lock.
                  */
-                hlist_del_init_rcu(&mn->hlist);
-                spin_unlock(&mm->mmu_notifier_mm->lock);
+                hlist_del_init_rcu(&subscription->hlist);
+                spin_unlock(&mm->notifier_subscriptions->lock);
         }

         /*
@@ -417,21 +904,312 @@
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

-/*
- * Same as mmu_notifier_unregister but no callback and no srcu synchronization.
- */
-void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
-                                        struct mm_struct *mm)
+static void mmu_notifier_free_rcu(struct rcu_head *rcu)
 {
-        spin_lock(&mm->mmu_notifier_mm->lock);
-        /*
-         * Can not use list_del_rcu() since __mmu_notifier_release
-         * can delete it before we hold the lock.
-         */
-        hlist_del_init_rcu(&mn->hlist);
-        spin_unlock(&mm->mmu_notifier_mm->lock);
+        struct mmu_notifier *subscription =
+                container_of(rcu, struct mmu_notifier, rcu);
+        struct mm_struct *mm = subscription->mm;

-        BUG_ON(atomic_read(&mm->mm_count) <= 0);
+        subscription->ops->free_notifier(subscription);
+        /* Pairs with the get in __mmu_notifier_register() */
         mmdrop(mm);
 }
-EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+/**
+ * mmu_notifier_put - Release the reference on the notifier
+ * @subscription: The notifier to act on
+ *
+ * This function must be paired with each mmu_notifier_get(), it releases the
+ * reference obtained by the get. If this is the last reference then the
+ * process to free the notifier will be run asynchronously.
+ *
+ * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
+ * when the mm_struct is destroyed. Instead free_notifier is always called to
+ * release any resources held by the user.
+ *
+ * As ops->release is not guaranteed to be called, the user must ensure that
+ * all sptes are dropped, and no new sptes can be established before
+ * mmu_notifier_put() is called.
+ *
+ * This function can be called from the ops->release callback, however the
+ * caller must still ensure it is called pairwise with mmu_notifier_get().
+ *
+ * Modules calling this function must call mmu_notifier_synchronize() in
+ * their __exit functions to ensure the async work is completed.
+ */
+void mmu_notifier_put(struct mmu_notifier *subscription)
+{
+        struct mm_struct *mm = subscription->mm;
+
+        spin_lock(&mm->notifier_subscriptions->lock);
+        if (WARN_ON(!subscription->users) || --subscription->users)
+                goto out_unlock;
+        hlist_del_init_rcu(&subscription->hlist);
+        spin_unlock(&mm->notifier_subscriptions->lock);
+
+        call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
+        return;
+
+out_unlock:
+        spin_unlock(&mm->notifier_subscriptions->lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_put);
+
+static int __mmu_interval_notifier_insert(
+        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
+        struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
+        unsigned long length, const struct mmu_interval_notifier_ops *ops)
+{
+        interval_sub->mm = mm;
+        interval_sub->ops = ops;
+        RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
+        interval_sub->interval_tree.start = start;
+        /*
+         * Note that the representation of the intervals in the interval tree
+         * considers the ending point as contained in the interval.
+         */
+        if (length == 0 ||
+            check_add_overflow(start, length - 1,
+                               &interval_sub->interval_tree.last))
+                return -EOVERFLOW;
+
+        /* Must call with a mmget() held */
+        if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
+                return -EINVAL;
+
+        /* pairs with mmdrop in mmu_interval_notifier_remove() */
+        mmgrab(mm);
+
+        /*
+         * If some invalidate_range_start/end region is going on in parallel
+         * we don't know what VA ranges are affected, so we must assume this
+         * new range is included.
+         *
+         * If the itree is invalidating then we are not allowed to change
+         * it. Retrying until invalidation is done is tricky due to the
+         * possibility for live lock, instead defer the add to
+         * mn_itree_inv_end() so this algorithm is deterministic.
+         *
+         * In all cases the value for the interval_sub->invalidate_seq should be
+         * odd, see mmu_interval_read_begin()
+         */
+        spin_lock(&subscriptions->lock);
+        if (subscriptions->active_invalidate_ranges) {
+                if (mn_itree_is_invalidating(subscriptions))
+                        hlist_add_head(&interval_sub->deferred_item,
+                                       &subscriptions->deferred_list);
+                else {
+                        subscriptions->invalidate_seq |= 1;
+                        interval_tree_insert(&interval_sub->interval_tree,
+                                             &subscriptions->itree);
+                }
+                interval_sub->invalidate_seq = subscriptions->invalidate_seq;
+        } else {
+                WARN_ON(mn_itree_is_invalidating(subscriptions));
+                /*
+                 * The starting seq for a subscription not under invalidation
+                 * should be odd, not equal to the current invalidate_seq and
+                 * invalidate_seq should not 'wrap' to the new seq any time
+                 * soon.
+                 */
+                interval_sub->invalidate_seq =
+                        subscriptions->invalidate_seq - 1;
+                interval_tree_insert(&interval_sub->interval_tree,
+                                     &subscriptions->itree);
+        }
+        spin_unlock(&subscriptions->lock);
+        return 0;
+}
+
+/**
+ * mmu_interval_notifier_insert - Insert an interval notifier
+ * @interval_sub: Interval subscription to register
+ * @start: Starting virtual address to monitor
+ * @length: Length of the range to monitor
+ * @mm: mm_struct to attach to
+ * @ops: Interval notifier operations to be called on matching events
+ *
+ * This function subscribes the interval notifier for notifications from the
+ * mm. Upon return the ops related to mmu_interval_notifier will be called
+ * whenever an event that intersects with the given range occurs.
+ *
+ * Upon return the range_notifier may not be present in the interval tree yet.
+ * The caller must use the normal interval notifier read flow via
+ * mmu_interval_read_begin() to establish SPTEs for this range.
+ */
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+                                 struct mm_struct *mm, unsigned long start,
+                                 unsigned long length,
+                                 const struct mmu_interval_notifier_ops *ops)
+{
+        struct mmu_notifier_subscriptions *subscriptions;
+        int ret;
+
+        might_lock(&mm->mmap_lock);
+
+        subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
+        if (!subscriptions || !subscriptions->has_itree) {
+                ret = mmu_notifier_register(NULL, mm);
+                if (ret)
+                        return ret;
+                subscriptions = mm->notifier_subscriptions;
+        }
+        return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+                                              start, length, ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
+
+int mmu_interval_notifier_insert_locked(
+        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
+        unsigned long start, unsigned long length,
+        const struct mmu_interval_notifier_ops *ops)
+{
+        struct mmu_notifier_subscriptions *subscriptions =
+                mm->notifier_subscriptions;
+        int ret;
+
+        mmap_assert_write_locked(mm);
+
+        if (!subscriptions || !subscriptions->has_itree) {
+                ret = __mmu_notifier_register(NULL, mm);
+                if (ret)
+                        return ret;
+                subscriptions = mm->notifier_subscriptions;
+        }
+        return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+                                              start, length, ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+                          unsigned long seq)
+{
+        bool ret;
+
+        spin_lock(&subscriptions->lock);
+        ret = subscriptions->invalidate_seq != seq;
+        spin_unlock(&subscriptions->lock);
+        return ret;
+}
+
+/**
+ * mmu_interval_notifier_remove - Remove an interval notifier
+ * @interval_sub: Interval subscription to unregister
+ *
+ * This function must be paired with mmu_interval_notifier_insert(). It cannot
+ * be called from any ops callback.
+ *
+ * Once this returns ops callbacks are no longer running on other CPUs and
+ * will not be called in future.
+ */
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
+{
+        struct mm_struct *mm = interval_sub->mm;
+        struct mmu_notifier_subscriptions *subscriptions =
+                mm->notifier_subscriptions;
+        unsigned long seq = 0;
+
+        might_sleep();
+
+        spin_lock(&subscriptions->lock);
+        if (mn_itree_is_invalidating(subscriptions)) {
+                /*
+                 * remove is being called after insert put this on the
+                 * deferred list, but before the deferred list was processed.
+                 */
+                if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
+                        hlist_del(&interval_sub->deferred_item);
+                } else {
+                        hlist_add_head(&interval_sub->deferred_item,
+                                       &subscriptions->deferred_list);
+                        seq = subscriptions->invalidate_seq;
+                }
+        } else {
+                WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
+                interval_tree_remove(&interval_sub->interval_tree,
+                                     &subscriptions->itree);
+        }
+        spin_unlock(&subscriptions->lock);
+
+        /*
+         * The possible sleep on progress in the invalidation requires the
+         * caller not hold any locks held by invalidation callbacks.
+         */
+        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+        if (seq)
+                wait_event(subscriptions->wq,
+                           mmu_interval_seq_released(subscriptions, seq));
+
+        /* pairs with mmgrab in mmu_interval_notifier_insert() */
+        mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
+
+/**
+ * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
+ *
+ * This function ensures that all outstanding async SRCU work from
+ * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
+ * associated with an unused mmu_notifier will no longer be called.
+ *
+ * Before using the caller must ensure that all of its mmu_notifiers have been
+ * fully released via mmu_notifier_put().
+ *
+ * Modules using the mmu_notifier_put() API should call this in their __exit
+ * function to avoid module unloading races.
+ */
+void mmu_notifier_synchronize(void)
+{
+        synchronize_srcu(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+        if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+                return false;
+        /* Return true if the vma still has the read flag set. */
+        return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+
+bool mmu_notifier_subscriptions_init(struct mm_struct *mm)
+{
+        struct mmu_notifier_subscriptions *subscriptions;
+        struct percpu_rw_semaphore_atomic *sem;
+
+        subscriptions = kzalloc(
+                sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
+        if (!subscriptions)
+                return false;
+
+        sem = kzalloc(sizeof(struct percpu_rw_semaphore_atomic), GFP_KERNEL);
+        if (!sem) {
+                kfree(subscriptions);
+                return false;
+        }
+        percpu_init_rwsem(&sem->rw_sem);
+
+        init_subscriptions(subscriptions);
+        subscriptions->has_itree = true;
+        subscriptions->hdr.valid = false;
+        subscriptions->hdr.mmu_notifier_lock = sem;
+        mm->notifier_subscriptions = subscriptions;
+
+        return true;
+}
+
+void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
+{
+        percpu_rwsem_async_destroy(
+                mm->notifier_subscriptions->hdr.mmu_notifier_lock);
+        kfree(mm->notifier_subscriptions);
+        mm->notifier_subscriptions = NULL;
+}
+
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
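
Reviewer note, not part of the patch: the collision-retry scheme added by this change is easiest to see from a consumer's point of view. The sketch below is illustrative only; "struct my_mirror", the driver mutex used as the 'user_lock', and the my_zap_sptes()/my_fault_and_map() helpers are hypothetical placeholders, not kernel APIs. It shows the intended pairing this file documents: the ops->invalidate callback publishes cur_seq with mmu_interval_set_seq() under the driver lock and tears down SPTEs, while the fault path registers with mmu_interval_notifier_insert() and re-checks mmu_interval_read_retry() under the same lock before committing SPTEs.

#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_mirror {                              /* hypothetical driver state */
        struct mmu_interval_notifier notifier;
        struct mutex lock;                      /* serves as the 'user_lock' */
};

/* Placeholders standing in for the driver's real SPTE management. */
static void my_zap_sptes(struct my_mirror *m, unsigned long start,
                         unsigned long end) { }
static int my_fault_and_map(struct my_mirror *m, unsigned long start,
                            unsigned long length) { return 0; }

static bool my_invalidate(struct mmu_interval_notifier *sub,
                          const struct mmu_notifier_range *range,
                          unsigned long cur_seq)
{
        struct my_mirror *m = container_of(sub, struct my_mirror, notifier);

        if (mmu_notifier_range_blockable(range))
                mutex_lock(&m->lock);
        else if (!mutex_trylock(&m->lock))
                return false;           /* core mm will retry as blockable */

        /* Publish the new sequence under the driver lock ... */
        mmu_interval_set_seq(sub, cur_seq);
        /* ... then tear down any shadow PTEs covering the range. */
        my_zap_sptes(m, range->start, range->end);
        mutex_unlock(&m->lock);
        return true;
}

static const struct mmu_interval_notifier_ops my_mirror_ops = {
        .invalidate = my_invalidate,
};

/* Fault path: retry until a read section completes without a collision. */
static int my_make_sptes(struct my_mirror *m, struct mm_struct *mm,
                         unsigned long start, unsigned long length)
{
        unsigned long seq;
        int ret;

        ret = mmu_interval_notifier_insert(&m->notifier, mm, start, length,
                                           &my_mirror_ops);
        if (ret)
                return ret;

again:
        seq = mmu_interval_read_begin(&m->notifier);
        ret = my_fault_and_map(m, start, length);       /* may sleep here */
        if (ret)
                return ret;

        mutex_lock(&m->lock);
        if (mmu_interval_read_retry(&m->notifier, seq)) {
                mutex_unlock(&m->lock);
                goto again;             /* collided with an invalidation */
        }
        /* Safe to commit the SPTEs while still holding the driver lock. */
        mutex_unlock(&m->lock);
        return 0;
}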