2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/fs/btrfs/locking.c
@@ -8,209 +8,398 @@
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
 #include <asm/bug.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_io.h"
 #include "locking.h"
 
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-
 /*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
+ * Extent buffer locking
+ * =====================
+ *
+ * The locks use a custom scheme that allows doing more operations than are
+ * available from current locking primitives. The building blocks are still
+ * rwlock and wait queues.
+ *
+ * Required semantics:
+ *
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - spinning lock semantics
+ * - blocking lock semantics
+ * - try-lock semantics for readers and writers
+ * - one level nesting, allowing read lock to be taken by the same thread that
+ *   already has write lock
+ *
+ * The extent buffer locks (also called tree locks) manage access to eb data
+ * related to the storage in the b-tree (keys, items, but not the individual
+ * members of eb).
+ * We want concurrency of many readers and safe updates. The underlying locking
+ * is done by read-write spinlock and the blocking part is implemented using
+ * counters and wait queues.
+ *
+ * spinning semantics - the low-level rwlock is held so all other threads that
+ *                      want to take it are spinning on it.
+ *
+ * blocking semantics - the low-level rwlock is not held but the counter
+ *                      denotes how many times the blocking lock was held;
+ *                      sleeping is possible
+ *
+ * Write lock always allows only one thread to access the data.
+ *
+ *
+ * Debugging
+ * ---------
+ *
+ * There are additional state counters that are asserted in various contexts,
+ * removed from non-debug build to reduce extent_buffer size and for
+ * performance reasons.
+ *
+ *
+ * Lock recursion
+ * --------------
+ *
+ * A write operation on a tree might indirectly start a lookup on the same
+ * tree. This can happen when btrfs_cow_block locks the tree and needs to
+ * look up free extents.
+ *
+ * btrfs_cow_block
+ *   ..
+ *   alloc_tree_block_no_bg_flush
+ *     btrfs_alloc_tree_block
+ *       btrfs_reserve_extent
+ *         ..
+ *         load_free_space_cache
+ *           ..
+ *           btrfs_lookup_file_extent
+ *             btrfs_search_slot
+ *
+ *
+ * Locking pattern - spinning
+ * --------------------------
+ *
+ * The simple locking scenario, the +--+ denotes the spinning section.
+ *
+ * +- btrfs_tree_lock
+ * | - extent_buffer::rwlock is held
+ * | - no heavy operations should happen, eg. IO, memory allocations, large
+ * |   structure traversals
+ * +- btrfs_tree_unlock
+ *
+ *
+ * Locking pattern - blocking
+ * --------------------------
+ *
+ * The blocking write uses the following scheme. The +--+ denotes the spinning
+ * section.
+ *
+ * +- btrfs_tree_lock
+ * |
+ * +- btrfs_set_lock_blocking_write
+ *
+ *  - allowed: IO, memory allocations, etc.
+ *
+ * -- btrfs_tree_unlock - note, no explicit unblocking necessary
+ *
+ *
+ * Blocking read is similar.
+ *
+ * +- btrfs_tree_read_lock
+ * |
+ * +- btrfs_set_lock_blocking_read
+ *
+ *  - heavy operations allowed
+ *
+ * +- btrfs_tree_read_unlock_blocking
+ * |
+ * +- btrfs_tree_read_unlock
+ *
  */
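The two patterns above translate into short call sequences. The following is a rough caller-side sketch, not code from this patch; the demo_* helpers are hypothetical and assume the usual btrfs build context (extent_buffer, locking.h):

static void demo_write_blocking_pattern(struct extent_buffer *eb)
{
        btrfs_tree_lock(eb);                    /* spinning: eb->lock held for write */
        /* only lightweight work while spinning */
        btrfs_set_lock_blocking_write(eb);      /* eb->lock dropped, waiters may sleep */
        /* heavy work allowed here: IO, memory allocations, ... */
        btrfs_tree_unlock(eb);                  /* no explicit unblocking needed */
}

static void demo_read_blocking_pattern(struct extent_buffer *eb)
{
        btrfs_tree_read_lock(eb);               /* spinning read lock */
        btrfs_set_lock_blocking_read(eb);       /* eb->lock dropped, blocking_readers bumped */
        /* heavy work allowed here */
        btrfs_tree_read_unlock_blocking(eb);    /* pairs with the blocking read above */
}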
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
 {
-        /*
-         * no lock is required. The lock owner may change if
-         * we have a read lock, but it won't change to or away
-         * from us. If we have the write lock, we are the owner
-         * and it'll never change.
-         */
-        if (eb->lock_nested && current->pid == eb->lock_owner)
-                return;
-        if (rw == BTRFS_WRITE_LOCK) {
-                if (atomic_read(&eb->blocking_writers) == 0) {
-                        WARN_ON(atomic_read(&eb->spinning_writers) != 1);
-                        atomic_dec(&eb->spinning_writers);
-                        btrfs_assert_tree_locked(eb);
-                        atomic_inc(&eb->blocking_writers);
-                        write_unlock(&eb->lock);
-                }
-        } else if (rw == BTRFS_READ_LOCK) {
-                btrfs_assert_tree_read_locked(eb);
-                atomic_inc(&eb->blocking_readers);
-                WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-                atomic_dec(&eb->spinning_readers);
-                read_unlock(&eb->lock);
-        }
+        WARN_ON(eb->spinning_writers);
+        eb->spinning_writers++;
 }
 
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
 {
-        /*
-         * no lock is required. The lock owner may change if
-         * we have a read lock, but it won't change to or away
-         * from us. If we have the write lock, we are the owner
-         * and it'll never change.
-         */
-        if (eb->lock_nested && current->pid == eb->lock_owner)
-                return;
-
-        if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
-                BUG_ON(atomic_read(&eb->blocking_writers) != 1);
-                write_lock(&eb->lock);
-                WARN_ON(atomic_read(&eb->spinning_writers));
-                atomic_inc(&eb->spinning_writers);
-                /* atomic_dec_and_test implies a barrier */
-                if (atomic_dec_and_test(&eb->blocking_writers))
-                        cond_wake_up_nomb(&eb->write_lock_wq);
-        } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
-                BUG_ON(atomic_read(&eb->blocking_readers) == 0);
-                read_lock(&eb->lock);
-                atomic_inc(&eb->spinning_readers);
-                /* atomic_dec_and_test implies a barrier */
-                if (atomic_dec_and_test(&eb->blocking_readers))
-                        cond_wake_up_nomb(&eb->read_lock_wq);
-        }
+        WARN_ON(eb->spinning_writers != 1);
+        eb->spinning_writers--;
 }
 
-/*
- * take a spinning read lock. This will wait for any blocking
- * writers
- */
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
 {
-again:
-        BUG_ON(!atomic_read(&eb->blocking_writers) &&
-               current->pid == eb->lock_owner);
+        WARN_ON(eb->spinning_writers);
+}
 
-        read_lock(&eb->lock);
-        if (atomic_read(&eb->blocking_writers) &&
-            current->pid == eb->lock_owner) {
-                /*
-                 * This extent is already write-locked by our thread. We allow
-                 * an additional read lock to be added because it's for the same
-                 * thread. btrfs_find_all_roots() depends on this as it may be
-                 * called on a partly (write-)locked tree.
-                 */
-                BUG_ON(eb->lock_nested);
-                eb->lock_nested = 1;
-                read_unlock(&eb->lock);
-                return;
-        }
-        if (atomic_read(&eb->blocking_writers)) {
-                read_unlock(&eb->lock);
-                wait_event(eb->write_lock_wq,
-                           atomic_read(&eb->blocking_writers) == 0);
-                goto again;
-        }
-        atomic_inc(&eb->read_locks);
+static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
+{
         atomic_inc(&eb->spinning_readers);
 }
 
+static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
+{
+        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+        atomic_dec(&eb->spinning_readers);
+}
+
+static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+{
+        atomic_inc(&eb->read_locks);
+}
+
+static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
+{
+        atomic_dec(&eb->read_locks);
+}
+
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+        BUG_ON(!atomic_read(&eb->read_locks));
+}
+
+static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
+{
+        eb->write_locks++;
+}
+
+static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+{
+        eb->write_locks--;
+}
+
+#else
+static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
+#endif
+
 /*
- * take a spinning read lock.
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Mark already held read lock as blocking. Can be nested in write lock by the
+ * same thread.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and the blocking reader counter is increased.
+ */
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
+{
+        trace_btrfs_set_lock_blocking_read(eb);
+        /*
+         * No lock is required. The lock owner may change if we have a read
+         * lock, but it won't change to or away from us. If we have the write
+         * lock, we are the owner and it'll never change.
+         */
+        if (eb->lock_recursed && current->pid == eb->lock_owner)
+                return;
+        btrfs_assert_tree_read_locked(eb);
+        atomic_inc(&eb->blocking_readers);
+        btrfs_assert_spinning_readers_put(eb);
+        read_unlock(&eb->lock);
+}
+
+/*
+ * Mark already held write lock as blocking.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and blocking writers is set.
+ */
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+        trace_btrfs_set_lock_blocking_write(eb);
+        /*
+         * No lock is required. The lock owner may change if we have a read
+         * lock, but it won't change to or away from us. If we have the write
+         * lock, we are the owner and it'll never change.
+         */
+        if (eb->lock_recursed && current->pid == eb->lock_owner)
+                return;
+        if (eb->blocking_writers == 0) {
+                btrfs_assert_spinning_writers_put(eb);
+                btrfs_assert_tree_locked(eb);
+                WRITE_ONCE(eb->blocking_writers, 1);
+                write_unlock(&eb->lock);
+        }
+}
+
+/*
+ * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
+ * Can be nested in write lock by the same thread.
+ *
+ * Use when the locked section does only lightweight actions and busy waiting
+ * would be cheaper than making other threads do the wait/wake loop.
+ *
+ * The rwlock is held upon exit.
+ */
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+                            bool recurse)
+{
+        u64 start_ns = 0;
+
+        if (trace_btrfs_tree_read_lock_enabled())
+                start_ns = ktime_get_ns();
+again:
+        read_lock(&eb->lock);
+        BUG_ON(eb->blocking_writers == 0 &&
+               current->pid == eb->lock_owner);
+        if (eb->blocking_writers) {
+                if (current->pid == eb->lock_owner) {
+                        /*
+                         * This extent is already write-locked by our thread.
+                         * We allow an additional read lock to be added because
+                         * it's for the same thread. btrfs_find_all_roots()
+                         * depends on this as it may be called on a partly
+                         * (write-)locked tree.
+                         */
+                        WARN_ON(!recurse);
+                        BUG_ON(eb->lock_recursed);
+                        eb->lock_recursed = true;
+                        read_unlock(&eb->lock);
+                        trace_btrfs_tree_read_lock(eb, start_ns);
+                        return;
+                }
+                read_unlock(&eb->lock);
+                wait_event(eb->write_lock_wq,
+                           READ_ONCE(eb->blocking_writers) == 0);
+                goto again;
+        }
+        btrfs_assert_tree_read_locks_get(eb);
+        btrfs_assert_spinning_readers_get(eb);
+        trace_btrfs_tree_read_lock(eb, start_ns);
+}
+
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+        __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
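The recurse branch above is what the lock-recursion chain in the top comment relies on (btrfs_cow_block eventually reaching btrfs_search_slot on the same tree). A condensed, hypothetical sketch of the rule, not taken from this patch: the same task already owns the write lock, has made it blocking, then nests a single read lock and drops it before the final write unlock.

static void demo_one_level_nesting(struct extent_buffer *eb)
{
        btrfs_tree_lock(eb);                    /* this task becomes eb->lock_owner */
        btrfs_set_lock_blocking_write(eb);      /* rwlock dropped, so the nested read can proceed */

        __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, true);        /* recursion allowed */
        /* ... read-only lookup on the partly (write-)locked tree ... */
        btrfs_tree_read_unlock(eb);             /* release the nested read before the write unlock */

        btrfs_tree_unlock(eb);                  /* wakes blocked waiters */
}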
+
+/*
+ * Lock extent buffer for read, optimistically expecting that there are no
+ * contending blocking writers. If there are, don't wait.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
  */
 int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
 {
-        if (atomic_read(&eb->blocking_writers))
+        if (READ_ONCE(eb->blocking_writers))
                 return 0;
 
         read_lock(&eb->lock);
-        if (atomic_read(&eb->blocking_writers)) {
+        /* Refetch value after lock */
+        if (READ_ONCE(eb->blocking_writers)) {
                 read_unlock(&eb->lock);
                 return 0;
         }
-        atomic_inc(&eb->read_locks);
-        atomic_inc(&eb->spinning_readers);
+        btrfs_assert_tree_read_locks_get(eb);
+        btrfs_assert_spinning_readers_get(eb);
+        trace_btrfs_tree_read_lock_atomic(eb);
         return 1;
 }
 
 /*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Try-lock for read. Don't block or wait for contending writers.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
  */
 int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
-        if (atomic_read(&eb->blocking_writers))
+        if (READ_ONCE(eb->blocking_writers))
                 return 0;
 
         if (!read_trylock(&eb->lock))
                 return 0;
 
-        if (atomic_read(&eb->blocking_writers)) {
+        /* Refetch value after lock */
+        if (READ_ONCE(eb->blocking_writers)) {
                 read_unlock(&eb->lock);
                 return 0;
         }
-        atomic_inc(&eb->read_locks);
-        atomic_inc(&eb->spinning_readers);
+        btrfs_assert_tree_read_locks_get(eb);
+        btrfs_assert_spinning_readers_get(eb);
+        trace_btrfs_try_tree_read_lock(eb);
         return 1;
 }
 
 /*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers or readers
+ * Try-lock for write. May block until the lock is uncontended, but does not
+ * wait until it is free.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
  */
 int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
-        if (atomic_read(&eb->blocking_writers) ||
-            atomic_read(&eb->blocking_readers))
+        if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
                 return 0;
 
         write_lock(&eb->lock);
-        if (atomic_read(&eb->blocking_writers) ||
-            atomic_read(&eb->blocking_readers)) {
+        /* Refetch value after lock */
+        if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
                 write_unlock(&eb->lock);
                 return 0;
         }
-        atomic_inc(&eb->write_locks);
-        atomic_inc(&eb->spinning_writers);
+        btrfs_assert_tree_write_locks_get(eb);
+        btrfs_assert_spinning_writers_get(eb);
         eb->lock_owner = current->pid;
+        trace_btrfs_try_tree_write_lock(eb);
         return 1;
 }
 
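The unlocked check followed by the "Refetch value after lock" check is the usual optimistic double-check: a writer may turn blocking between the first test and taking the rwlock, so the value is re-read under the lock. A hedged sketch of how a caller might combine the try-lock with the sleeping fallback (hypothetical helper, not code from this patch or from btrfs_search_slot):

static void demo_try_then_fallback(struct extent_buffer *eb)
{
        if (!btrfs_try_tree_read_lock(eb)) {
                /* Contended by blocking writers: take the waiting path instead. */
                btrfs_tree_read_lock(eb);
        }
        /* ... short spinning read section ... */
        btrfs_tree_read_unlock(eb);
}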
 /*
- * drop a spinning read lock
+ * Release read lock. Must be used only if the lock is in spinning mode. If
+ * the read lock is nested, must pair with read lock before the write unlock.
+ *
+ * The rwlock is not held upon exit.
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+        trace_btrfs_tree_read_unlock(eb);
         /*
          * if we're nested, we have the write lock. No new locking
          * is needed as long as we are the lock owner.
-         * The write unlock will do a barrier for us, and the lock_nested
+         * The write unlock will do a barrier for us, and the lock_recursed
          * field only matters to the lock owner.
          */
-        if (eb->lock_nested && current->pid == eb->lock_owner) {
-                eb->lock_nested = 0;
+        if (eb->lock_recursed && current->pid == eb->lock_owner) {
+                eb->lock_recursed = false;
                 return;
         }
         btrfs_assert_tree_read_locked(eb);
-        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-        atomic_dec(&eb->spinning_readers);
-        atomic_dec(&eb->read_locks);
+        btrfs_assert_spinning_readers_put(eb);
+        btrfs_assert_tree_read_locks_put(eb);
         read_unlock(&eb->lock);
 }
 
 /*
- * drop a blocking read lock
+ * Release read lock, previously set to blocking by a pairing call to
+ * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
+ * thread.
+ *
+ * State of rwlock is unchanged, last reader wakes waiting threads.
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+        trace_btrfs_tree_read_unlock_blocking(eb);
         /*
          * if we're nested, we have the write lock. No new locking
          * is needed as long as we are the lock owner.
-         * The write unlock will do a barrier for us, and the lock_nested
+         * The write unlock will do a barrier for us, and the lock_recursed
          * field only matters to the lock owner.
          */
-        if (eb->lock_nested && current->pid == eb->lock_owner) {
-                eb->lock_nested = 0;
+        if (eb->lock_recursed && current->pid == eb->lock_owner) {
+                eb->lock_recursed = false;
                 return;
         }
         btrfs_assert_tree_read_locked(eb);
@@ -218,70 +407,268 @@
         /* atomic_dec_and_test implies a barrier */
         if (atomic_dec_and_test(&eb->blocking_readers))
                 cond_wake_up_nomb(&eb->read_lock_wq);
-        atomic_dec(&eb->read_locks);
+        btrfs_assert_tree_read_locks_put(eb);
 }
 
 /*
- * take a spinning write lock. This will wait for both
- * blocking readers or writers
+ * Lock for write. Wait for all blocking and spinning readers and writers. This
+ * starts context where reader lock could be nested by the same thread.
+ *
+ * The rwlock is held for write upon exit.
  */
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+        __acquires(&eb->lock)
 {
+        u64 start_ns = 0;
+
+        if (trace_btrfs_tree_lock_enabled())
+                start_ns = ktime_get_ns();
+
         WARN_ON(eb->lock_owner == current->pid);
 again:
         wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
-        wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+        wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
         write_lock(&eb->lock);
-        if (atomic_read(&eb->blocking_readers)) {
+        /* Refetch value after lock */
+        if (atomic_read(&eb->blocking_readers) ||
+            READ_ONCE(eb->blocking_writers)) {
                 write_unlock(&eb->lock);
-                wait_event(eb->read_lock_wq,
-                           atomic_read(&eb->blocking_readers) == 0);
                 goto again;
         }
-        if (atomic_read(&eb->blocking_writers)) {
-                write_unlock(&eb->lock);
-                wait_event(eb->write_lock_wq,
-                           atomic_read(&eb->blocking_writers) == 0);
-                goto again;
-        }
-        WARN_ON(atomic_read(&eb->spinning_writers));
-        atomic_inc(&eb->spinning_writers);
-        atomic_inc(&eb->write_locks);
+        btrfs_assert_spinning_writers_get(eb);
+        btrfs_assert_tree_write_locks_get(eb);
         eb->lock_owner = current->pid;
+        trace_btrfs_tree_lock(eb, start_ns);
+}
+
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+        __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
 }
 
 /*
- * drop a spinning or a blocking write lock.
+ * Release the write lock, either blocking or spinning (ie. there's no need
+ * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
+ * This also ends the context for nesting, the read lock must have been
+ * released already.
+ *
+ * Tasks blocked and waiting are woken, rwlock is not held upon exit.
  */
 void btrfs_tree_unlock(struct extent_buffer *eb)
 {
-        int blockers = atomic_read(&eb->blocking_writers);
+        /*
+         * This is read both locked and unlocked but always by the same thread
+         * that already owns the lock so we don't need to use READ_ONCE
+         */
+        int blockers = eb->blocking_writers;
 
         BUG_ON(blockers > 1);
 
         btrfs_assert_tree_locked(eb);
+        trace_btrfs_tree_unlock(eb);
         eb->lock_owner = 0;
-        atomic_dec(&eb->write_locks);
+        btrfs_assert_tree_write_locks_put(eb);
 
         if (blockers) {
-                WARN_ON(atomic_read(&eb->spinning_writers));
-                atomic_dec(&eb->blocking_writers);
-                /* Use the lighter barrier after atomic */
-                smp_mb__after_atomic();
-                cond_wake_up_nomb(&eb->write_lock_wq);
+                btrfs_assert_no_spinning_writers(eb);
+                /* Unlocked write */
+                WRITE_ONCE(eb->blocking_writers, 0);
+                /*
+                 * We need to order modifying blocking_writers above with
+                 * actually waking up the sleepers to ensure they see the
+                 * updated value of blocking_writers
+                 */
+                cond_wake_up(&eb->write_lock_wq);
         } else {
-                WARN_ON(atomic_read(&eb->spinning_writers) != 1);
-                atomic_dec(&eb->spinning_writers);
+                btrfs_assert_spinning_writers_put(eb);
                 write_unlock(&eb->lock);
         }
 }
 
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
+/*
+ * Set all locked nodes in the path to blocking locks. This should be done
+ * before scheduling
+ */
+void btrfs_set_path_blocking(struct btrfs_path *p)
 {
-        BUG_ON(!atomic_read(&eb->write_locks));
+        int i;
+
+        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+                if (!p->nodes[i] || !p->locks[i])
+                        continue;
+                /*
+                 * If we currently have a spinning reader or writer lock this
+                 * will bump the count of blocking holders and drop the
+                 * spinlock.
+                 */
+                if (p->locks[i] == BTRFS_READ_LOCK) {
+                        btrfs_set_lock_blocking_read(p->nodes[i]);
+                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+                } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+                        btrfs_set_lock_blocking_write(p->nodes[i]);
+                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+                }
+        }
 }
 
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+/*
+ * This releases any locks held in the path starting at level and going all the
+ * way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few corner
+ * cases, such as COW of the block at slot zero in the node. This ignores
+ * those rules, and it should only be called when there are no more updates to
+ * be done higher up in the tree.
+ */
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
-        BUG_ON(!atomic_read(&eb->read_locks));
+        int i;
+
+        if (path->keep_locks)
+                return;
+
+        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+                if (!path->nodes[i])
+                        continue;
+                if (!path->locks[i])
+                        continue;
+                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+                path->locks[i] = 0;
+        }
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with write lock held
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+        struct extent_buffer *eb;
+
+        while (1) {
+                eb = btrfs_root_node(root);
+                btrfs_tree_lock(eb);
+                if (eb == root->node)
+                        break;
+                btrfs_tree_unlock(eb);
+                free_extent_buffer(eb);
+        }
+        return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with read lock held
+ */
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+                                                  bool recurse)
+{
+        struct extent_buffer *eb;
+
+        while (1) {
+                eb = btrfs_root_node(root);
+                __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
+                if (eb == root->node)
+                        break;
+                btrfs_tree_read_unlock(eb);
+                free_extent_buffer(eb);
+        }
+        return eb;
+}
+
+/*
+ * DREW locks
+ * ==========
+ *
+ * DREW stands for double-reader-writer-exclusion lock. It's used in situations
+ * where you want to provide A-B exclusion but not AA or BB.
+ *
+ * The current implementation gives more priority to the reader. If a reader
+ * and a writer both race to acquire their respective sides of the lock, the
+ * writer would yield its lock as soon as it detects a concurrent reader.
+ * Additionally, if there are pending readers, no new writers would be allowed
+ * to come in and acquire the lock.
+ */
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+{
+        int ret;
+
+        ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+        if (ret)
+                return ret;
+
+        atomic_set(&lock->readers, 0);
+        init_waitqueue_head(&lock->pending_readers);
+        init_waitqueue_head(&lock->pending_writers);
+
+        return 0;
+}
+
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
+{
+        percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
+{
+        if (atomic_read(&lock->readers))
+                return false;
+
+        percpu_counter_inc(&lock->writers);
+
+        /* Ensure writers count is updated before we check for pending readers */
+        smp_mb();
+        if (atomic_read(&lock->readers)) {
+                btrfs_drew_write_unlock(lock);
+                return false;
+        }
+
+        return true;
+}
+
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
+{
+        while (true) {
+                if (btrfs_drew_try_write_lock(lock))
+                        return;
+                wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+        }
+}
+
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
+{
+        percpu_counter_dec(&lock->writers);
+        cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
+{
+        atomic_inc(&lock->readers);
+
+        /*
+         * Ensure the pending reader count is perceived BEFORE this reader
+         * goes to sleep in case of active writers. This guarantees new writers
+         * won't be allowed and that the current reader will be woken up when
+         * the last active writer finishes its jobs.
+         */
+        smp_mb__after_atomic();
+
+        wait_event(lock->pending_readers,
+                   percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
+{
+        /*
+         * atomic_dec_and_test implies a full barrier, so woken up writers
+         * are guaranteed to see the decrement
+         */
+        if (atomic_dec_and_test(&lock->readers))
+                wake_up(&lock->pending_writers);
 }
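As a rough illustration of the A-B exclusion described in the DREW comment, the sketch below is hypothetical and not part of this patch: operations of kind A take the reader side and may run concurrently with each other, operations of kind B take the writer side and may also run concurrently with each other, but A and B never overlap.

static void demo_drew_side_a(struct btrfs_drew_lock *lock)
{
        btrfs_drew_read_lock(lock);     /* AA sharing: other A operations may run too */
        /* ... work that must not overlap with any B operation ... */
        btrfs_drew_read_unlock(lock);
}

static void demo_drew_side_b(struct btrfs_drew_lock *lock)
{
        btrfs_drew_write_lock(lock);    /* BB sharing: the "write" side excludes only readers */
        /* ... work that must not overlap with any A operation ... */
        btrfs_drew_write_unlock(lock);
}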