2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/sched/membarrier.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  *
  * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 #include "sched.h"
 
@@ -27,22 +18,93 @@
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
 #endif
 
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK						\
 	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
 	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
 	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
-	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	\
+	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_core(void *info)
+{
+	/*
+	 * The smp_mb() in membarrier after all the IPIs is supposed to
+	 * ensure that memory on remote CPUs that occur before the IPI
+	 * become visible to membarrier()'s caller -- see scenario B in
+	 * the big comment at the top of this file.
+	 *
+	 * A sync_core() would provide this guarantee, but
+	 * sync_core_before_usermode() might end up being deferred until
+	 * after membarrier()'s smp_mb().
+	 */
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+
+	sync_core_before_usermode();
+}
+
+static void ipi_rseq(void *info)
+{
+	/*
+	 * Ensure that all stores done by the calling thread are visible
+	 * to the current task before the current task resumes. We could
+	 * probably optimize this away on most architectures, but by the
+	 * time we've already sent an IPI, the cost of the extra smp_mb()
+	 * is negligible.
+	 */
+	smp_mb();
+	rseq_preempt(current);
+}
+
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	this_cpu_write(runqueues.membarrier_state,
+		       atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting
+	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+	 * guarantee that no memory access following registration is reordered
+	 * before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+	/*
+	 * Issue a memory barrier before clearing membarrier_state to
+	 * guarantee that no memory access prior to exec is reordered after
+	 * clearing this state.
+	 */
+	smp_mb();
+	atomic_set(&mm->membarrier_state, 0);
+	/*
+	 * Keep the runqueue membarrier_state in sync with this mm
+	 * membarrier_state.
+	 */
+	this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
-	bool fallback = false;
 	cpumask_var_t tmpmask;
 
 	if (num_online_cpus() == 1)
@@ -54,17 +116,11 @@
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
-	/*
-	 * Expedited membarrier commands guarantee that they won't
-	 * block, hence the GFP_NOWAIT allocation flag and fallback
-	 * implementation.
-	 */
-	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-		/* Fallback for OOM. */
-		fallback = true;
-	}
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -79,23 +135,28 @@
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
-		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+			continue;
+
+		/*
+		 * Skip the CPU if it runs a kernel thread. The scheduler
+		 * leaves the prior task mm in place as an optimization when
+		 * scheduling a kthread.
+		 */
+		p = rcu_dereference(cpu_rq(cpu)->curr);
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		__cpumask_set_cpu(cpu, tmpmask);
 	}
-	if (!fallback) {
-		preempt_disable();
-		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-		preempt_enable();
-		free_cpumask_var(tmpmask);
-	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
 	cpus_read_unlock();
 
 	/*
@@ -107,25 +168,35 @@
 	return 0;
 }
 
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
 {
-	int cpu;
-	bool fallback = false;
 	cpumask_var_t tmpmask;
+	struct mm_struct *mm = current->mm;
+	smp_call_func_t ipi_func = ipi_mb;
 
-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		if (!(atomic_read(&current->mm->membarrier_state) &
+		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+		ipi_func = ipi_sync_core;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		if (!(atomic_read(&mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+			return -EPERM;
+		ipi_func = ipi_rseq;
 	} else {
-		if (!(atomic_read(&current->mm->membarrier_state) &
+		WARN_ON_ONCE(flags);
+		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 			return -EPERM;
 	}
 
-	if (num_online_cpus() == 1)
+	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
 		return 0;
 
 	/*
@@ -134,46 +205,73 @@
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
-	/*
-	 * Expedited membarrier commands guarantee that they won't
-	 * block, hence the GFP_NOWAIT allocation flag and fallback
-	 * implementation.
-	 */
-	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-		/* Fallback for OOM. */
-		fallback = true;
-	}
+	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
 
 	cpus_read_lock();
-	for_each_online_cpu(cpu) {
+
+	if (cpu_id >= 0) {
 		struct task_struct *p;
 
-		/*
-		 * Skipping the current CPU is OK even through we can be
-		 * migrated at any point. The current CPU, at the point
-		 * where we read raw_smp_processor_id(), is ensured to
-		 * be in program order with respect to the caller
-		 * thread. Therefore, we can skip this CPU from the
-		 * iteration.
-		 */
-		if (cpu == raw_smp_processor_id())
-			continue;
+		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+			goto out;
 		rcu_read_lock();
-		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm == current->mm) {
-			if (!fallback)
+		p = rcu_dereference(cpu_rq(cpu_id)->curr);
+		if (!p || p->mm != mm) {
+			rcu_read_unlock();
+			goto out;
+		}
+		rcu_read_unlock();
+	} else {
+		int cpu;
+
+		rcu_read_lock();
+		for_each_online_cpu(cpu) {
+			struct task_struct *p;
+
+			p = rcu_dereference(cpu_rq(cpu)->curr);
+			if (p && p->mm == mm)
 				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
 		rcu_read_unlock();
 	}
-	if (!fallback) {
-		preempt_disable();
-		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-		preempt_enable();
-		free_cpumask_var(tmpmask);
+
+	if (cpu_id >= 0) {
+		/*
+		 * smp_call_function_single() will call ipi_func() if cpu_id
+		 * is the calling CPU.
+		 */
+		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+	} else {
+		/*
+		 * For regular membarrier, we can save a few cycles by
+		 * skipping the current cpu -- we're about to do smp_mb()
+		 * below, and if we migrate to a different cpu, this cpu
+		 * and the new cpu will execute a full barrier in the
+		 * scheduler.
+		 *
+		 * For SYNC_CORE, we do need a barrier on the current cpu --
+		 * otherwise, if we are migrated and replaced by a different
+		 * task in the same mm just before, during, or after
+		 * membarrier, we will end up with some thread in the mm
+		 * running without a core sync.
+		 *
+		 * For RSEQ, don't rseq_preempt() the caller. User code
+		 * is not supposed to issue syscalls at all from inside an
+		 * rseq critical section.
+		 */
+		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+			preempt_disable();
+			smp_call_function_many(tmpmask, ipi_func, NULL, true);
+			preempt_enable();
+		} else {
+			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+		}
 	}
+
+out:
+	if (cpu_id < 0)
+		free_cpumask_var(tmpmask);
 	cpus_read_unlock();
 
 	/*
@@ -182,6 +280,63 @@
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+
+	return 0;
+}
+
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = rcu_dereference(rq->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+	}
+	rcu_read_unlock();
+
+	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
+
+	free_cpumask_var(tmpmask);
+	cpus_read_unlock();
 
 	return 0;
 }
@@ -190,28 +345,15 @@
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int ret;
 
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_sched();
-	}
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -222,12 +364,22 @@
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+	    ret;
 
-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+	} else {
+		WARN_ON_ONCE(flags);
 	}
 
 	/*
@@ -235,35 +387,41 @@
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & state) == state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_sched();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	if (flags & MEMBARRIER_FLAG_RSEQ)
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
+	atomic_or(set_state, &mm->membarrier_state);
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd:    Takes command values defined in enum membarrier_cmd.
+ * @flags:  Currently needs to be 0 for all commands other than
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ *          contains the CPU on which to interrupt (= restart)
+ *          the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ *          RSEQ CS should be interrupted (@cmd must be
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
  *
  * If this system call is not implemented, -ENOSYS is returned. If the
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
 * is guaranteed to be ordered with respect to sys_membarrier(). If we use
@@ -280,10 +438,21 @@
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 {
-	if (unlikely(flags))
-		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+			return -EINVAL;
+		break;
+	default:
+		if (unlikely(flags))
+			return -EINVAL;
+	}
+
+	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+		cpu_id = -1;
+
 	switch (cmd) {
 	case MEMBARRIER_CMD_QUERY:
 	{
@@ -298,20 +467,24 @@
 		if (tick_nohz_full_enabled())
 			return -EINVAL;
 		if (num_online_cpus() > 1)
-			synchronize_sched();
+			synchronize_rcu();
 		return 0;
 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 		return membarrier_global_expedited();
 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		return membarrier_private_expedited(0);
+		return membarrier_private_expedited(0, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 		return membarrier_register_private_expedited(0);
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
-		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
 	default:
 		return -EINVAL;
 	}
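
Usage note (not part of the patch): the hunks above widen membarrier() from two to three arguments and add the private expedited RSEQ commands. A minimal userspace sketch follows, assuming uapi headers that define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ and MEMBARRIER_CMD_FLAG_CPU; there is no libc wrapper, so the raw syscall is used, and the membarrier() helper and error handling below are illustrative only.

/* Hedged sketch: exercise the extended membarrier() interface. */
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Illustrative wrapper; the kernel entry point is SYSCALL_DEFINE3 above. */
static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* Register this mm for private expedited RSEQ membarriers. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
		perror("membarrier register");
		return 1;
	}

	/*
	 * Restart any RSEQ critical section of this mm running on CPU 0
	 * only, instead of IPI-ing every CPU that runs the mm.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
		       MEMBARRIER_CMD_FLAG_CPU, 0)) {
		perror("membarrier rseq");
		return 1;
	}
	return 0;
}

Without MEMBARRIER_CMD_FLAG_CPU the same command IPIs every CPU currently running a thread of the caller's mm, which corresponds to the cpu_id < 0 path in membarrier_private_expedited() above.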