.. | .. |
---|
47 | 47 | |
---|
48 | 48 | closure_init_stack(&cl); |
---|
49 | 49 | |
---|
50 | | - pr_debug("reading %u", bucket_index); |
---|
| 50 | + pr_debug("reading %u\n", bucket_index); |
---|
51 | 51 | |
---|
52 | 52 | while (offset < ca->sb.bucket_size) { |
---|
53 | 53 | reread: left = ca->sb.bucket_size - offset; |
---|
.. | .. |
---|
78 | 78 | size_t blocks, bytes = set_bytes(j); |
---|
79 | 79 | |
---|
80 | 80 | if (j->magic != jset_magic(&ca->sb)) { |
---|
81 | | - pr_debug("%u: bad magic", bucket_index); |
---|
| 81 | + pr_debug("%u: bad magic\n", bucket_index); |
---|
82 | 82 | return ret; |
---|
83 | 83 | } |
---|
84 | 84 | |
---|
85 | 85 | if (bytes > left << 9 || |
---|
86 | 86 | bytes > PAGE_SIZE << JSET_BITS) { |
---|
87 | | - pr_info("%u: too big, %zu bytes, offset %u", |
---|
| 87 | + pr_info("%u: too big, %zu bytes, offset %u\n", |
---|
88 | 88 | bucket_index, bytes, offset); |
---|
89 | 89 | return ret; |
---|
90 | 90 | } |
---|
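The hunk above adds the missing newlines to the messages printed by two of the per-jset sanity checks (bad magic, oversized set); together with the checksum check in the next hunk these decide whether a journal set read out of a bucket is accepted. A small compilable sketch of that validation order, using a hypothetical `jset_hdr` layout with placeholder magic and checksum helpers rather than bcache's real `struct jset`, `jset_magic()` and `csum_set()`:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical on-disk header, loosely modelled on struct jset. */
struct jset_hdr {
	uint64_t csum;
	uint64_t magic;
	uint64_t seq;
	uint32_t keys;			/* 64-bit words of payload that follow */
};

#define EXPECTED_MAGIC	0x245235c1a3625032ULL	/* placeholder, not bcache's value */
#define READ_BUF_BYTES	(4096u << 3)		/* stand-in for PAGE_SIZE << JSET_BITS */

/* Stand-in for csum_set(); the real code checksums the whole set. */
static uint64_t fake_csum(const struct jset_hdr *j)
{
	return j->magic ^ j->seq ^ j->keys;
}

/*
 * Same validation order as the hunk above: magic, then size against
 * both the space left in the bucket and the read buffer, then csum.
 */
static bool jset_valid(const struct jset_hdr *j, size_t left_bytes)
{
	size_t bytes = sizeof(*j) + (size_t)j->keys * 8;

	if (j->magic != EXPECTED_MAGIC) {
		fprintf(stderr, "bad magic\n");
		return false;
	}
	if (bytes > left_bytes || bytes > READ_BUF_BYTES) {
		fprintf(stderr, "too big, %zu bytes\n", bytes);
		return false;
	}
	if (j->csum != fake_csum(j)) {
		fprintf(stderr, "bad csum\n");
		return false;
	}
	return true;
}
```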
.. | .. |
---|
93 | 93 | goto reread; |
---|
94 | 94 | |
---|
95 | 95 | if (j->csum != csum_set(j)) { |
---|
96 | | - pr_info("%u: bad csum, %zu bytes, offset %u", |
---|
| 96 | + pr_info("%u: bad csum, %zu bytes, offset %u\n", |
---|
97 | 97 | bucket_index, bytes, offset); |
---|
98 | 98 | return ret; |
---|
99 | 99 | } |
---|
100 | 100 | |
---|
101 | | - blocks = set_blocks(j, block_bytes(ca->set)); |
---|
| 101 | + blocks = set_blocks(j, block_bytes(ca)); |
---|
102 | 102 | |
---|
| 103 | + /* |
---|
| 104 | + * Nodes in 'list' are kept in increasing order of |
---|
| 105 | + * i->j.seq: the node at the head has the smallest (oldest) |
---|
| 106 | + * journal seq, the node at the tail has the biggest |
---|
| 107 | + * (latest) journal seq. |
---|
| 108 | + */ |
---|
| 109 | + |
---|
| 110 | + /* |
---|
| 111 | + * Check from the oldest jset for last_seq. If |
---|
| 112 | + * i->j.seq < j->last_seq, it means the oldest jset |
---|
| 113 | + * in list is expired and useless, remove it from |
---|
| 114 | + * this list. Otherwise, j is a candidate jset for |
---|
| 115 | + * further following checks. |
---|
| 116 | + */ |
---|
103 | 117 | while (!list_empty(list)) { |
---|
104 | 118 | i = list_first_entry(list, |
---|
105 | 119 | struct journal_replay, list); |
---|
.. | .. |
---|
109 | 123 | kfree(i); |
---|
110 | 124 | } |
---|
111 | 125 | |
---|
| 126 | + /* iterate list in reverse order (from latest jset) */ |
---|
112 | 127 | list_for_each_entry_reverse(i, list, list) { |
---|
113 | 128 | if (j->seq == i->j.seq) |
---|
114 | 129 | goto next_set; |
---|
115 | 130 | |
---|
| 131 | + /* |
---|
| 132 | + * if j->seq is less than any i->j.last_seq |
---|
| 133 | + * in list, j is an expired and useless jset. |
---|
| 134 | + */ |
---|
116 | 135 | if (j->seq < i->j.last_seq) |
---|
117 | 136 | goto next_set; |
---|
118 | 137 | |
---|
| 138 | + /* |
---|
| 139 | + * 'where' points to the first jset in the list |
---|
| 140 | + * which is older than j. |
---|
| 141 | + */ |
---|
119 | 142 | if (j->seq > i->j.seq) { |
---|
120 | 143 | where = &i->list; |
---|
121 | 144 | goto add; |
---|
.. | .. |
---|
129 | 152 | if (!i) |
---|
130 | 153 | return -ENOMEM; |
---|
131 | 154 | memcpy(&i->j, j, bytes); |
---|
| 155 | + /* Add to the location after 'where' points to */ |
---|
132 | 156 | list_add(&i->list, where); |
---|
133 | 157 | ret = 1; |
---|
134 | 158 | |
---|
135 | | - ja->seq[bucket_index] = j->seq; |
---|
| 159 | + if (j->seq > ja->seq[bucket_index]) |
---|
| 160 | + ja->seq[bucket_index] = j->seq; |
---|
136 | 161 | next_set: |
---|
137 | 162 | offset += blocks * ca->sb.block_size; |
---|
138 | 163 | len -= blocks * ca->sb.block_size; |
---|
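The new comments in this hunk spell out the invariant journal_read_bucket() keeps: `list` stays sorted by seq with the oldest entry at the head, entries older than the incoming jset's last_seq are dropped from the head, duplicates are skipped, and a fresh entry is linked in right after the first node that is older than it. A standalone sketch of that insertion discipline using a hand-rolled doubly linked list (hypothetical `struct replay`, not the kernel's `struct journal_replay` or list API):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical replay node; the kernel version embeds a struct list_head. */
struct replay {
	uint64_t seq;
	uint64_t last_seq;
	struct replay *prev, *next;
};

struct replay_list {
	struct replay head;	/* sentinel: head.next is oldest, head.prev is newest */
};

static void list_init(struct replay_list *l)
{
	l->head.prev = l->head.next = &l->head;
}

static void unlink_node(struct replay *r)
{
	r->prev->next = r->next;
	r->next->prev = r->prev;
	free(r);
}

static void insert_after(struct replay *pos, struct replay *r)
{
	r->next = pos->next;
	r->prev = pos;
	pos->next->prev = r;
	pos->next = r;
}

/*
 * Mirrors the flow in the hunk above: drop expired entries from the
 * head, skip duplicates, then insert so ascending seq order is kept.
 * Returns true if the entry was added.
 */
static bool add_replay(struct replay_list *l, uint64_t seq, uint64_t last_seq)
{
	struct replay *i, *where = &l->head, *r;

	/* oldest entries first: anything below last_seq is no longer needed */
	while (l->head.next != &l->head && l->head.next->seq < last_seq)
		unlink_node(l->head.next);

	/* walk from newest to oldest, like list_for_each_entry_reverse() */
	for (i = l->head.prev; i != &l->head; i = i->prev) {
		if (i->seq == seq)
			return false;		/* already have this jset */
		if (seq < i->last_seq)
			return false;		/* expired relative to a newer jset */
		if (seq > i->seq) {
			where = i;		/* first node older than the new one */
			break;
		}
	}

	r = calloc(1, sizeof(*r));
	if (!r)
		return false;
	r->seq = seq;
	r->last_seq = last_seq;
	insert_after(where, r);			/* lands just after 'where' */
	return true;
}
```

The kernel version walks the list with list_for_each_entry_reverse() and inserts with list_add(&i->list, where); on top of that, this hunk now only raises ja->seq[bucket_index] instead of overwriting it unconditionally.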
.. | .. |
---|
147 | 172 | { |
---|
148 | 173 | #define read_bucket(b) \ |
---|
149 | 174 | ({ \ |
---|
150 | | - int ret = journal_read_bucket(ca, list, b); \ |
---|
| 175 | + ret = journal_read_bucket(ca, list, b); \ |
---|
151 | 176 | __set_bit(b, bitmap); \ |
---|
152 | 177 | if (ret < 0) \ |
---|
153 | 178 | return ret; \ |
---|
154 | 179 | ret; \ |
---|
155 | 180 | }) |
---|
156 | 181 | |
---|
157 | | - struct cache *ca; |
---|
158 | | - unsigned int iter; |
---|
| 182 | + struct cache *ca = c->cache; |
---|
| 183 | + int ret = 0; |
---|
| 184 | + struct journal_device *ja = &ca->journal; |
---|
| 185 | + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); |
---|
| 186 | + unsigned int i, l, r, m; |
---|
| 187 | + uint64_t seq; |
---|
159 | 188 | |
---|
160 | | - for_each_cache(ca, c, iter) { |
---|
161 | | - struct journal_device *ja = &ca->journal; |
---|
162 | | - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); |
---|
163 | | - unsigned int i, l, r, m; |
---|
164 | | - uint64_t seq; |
---|
| 189 | + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); |
---|
| 190 | + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); |
---|
165 | 191 | |
---|
166 | | - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); |
---|
167 | | - pr_debug("%u journal buckets", ca->sb.njournal_buckets); |
---|
168 | | - |
---|
| 192 | + /* |
---|
| 193 | + * Read journal buckets ordered by golden ratio hash to quickly |
---|
| 194 | + * find a sequence of buckets with valid journal entries |
---|
| 195 | + */ |
---|
| 196 | + for (i = 0; i < ca->sb.njournal_buckets; i++) { |
---|
169 | 197 | /* |
---|
170 | | - * Read journal buckets ordered by golden ratio hash to quickly |
---|
171 | | - * find a sequence of buckets with valid journal entries |
---|
| 198 | + * We must try the index l with ZERO first for |
---|
| 199 | + * correctness, because the journal buckets form a |
---|
| 200 | + * circular buffer which might have wrapped |
---|
172 | 201 | */ |
---|
173 | | - for (i = 0; i < ca->sb.njournal_buckets; i++) { |
---|
174 | | - /* |
---|
175 | | - * We must try the index l with ZERO first for |
---|
176 | | - * correctness due to the scenario that the journal |
---|
177 | | - * bucket is circular buffer which might have wrapped |
---|
178 | | - */ |
---|
179 | | - l = (i * 2654435769U) % ca->sb.njournal_buckets; |
---|
| 202 | + l = (i * 2654435769U) % ca->sb.njournal_buckets; |
---|
180 | 203 | |
---|
181 | | - if (test_bit(l, bitmap)) |
---|
182 | | - break; |
---|
| 204 | + if (test_bit(l, bitmap)) |
---|
| 205 | + break; |
---|
183 | 206 | |
---|
184 | | - if (read_bucket(l)) |
---|
185 | | - goto bsearch; |
---|
186 | | - } |
---|
187 | | - |
---|
188 | | - /* |
---|
189 | | - * If that fails, check all the buckets we haven't checked |
---|
190 | | - * already |
---|
191 | | - */ |
---|
192 | | - pr_debug("falling back to linear search"); |
---|
193 | | - |
---|
194 | | - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); |
---|
195 | | - l < ca->sb.njournal_buckets; |
---|
196 | | - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, |
---|
197 | | - l + 1)) |
---|
198 | | - if (read_bucket(l)) |
---|
199 | | - goto bsearch; |
---|
200 | | - |
---|
201 | | - /* no journal entries on this device? */ |
---|
202 | | - if (l == ca->sb.njournal_buckets) |
---|
203 | | - continue; |
---|
204 | | -bsearch: |
---|
205 | | - BUG_ON(list_empty(list)); |
---|
206 | | - |
---|
207 | | - /* Binary search */ |
---|
208 | | - m = l; |
---|
209 | | - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); |
---|
210 | | - pr_debug("starting binary search, l %u r %u", l, r); |
---|
211 | | - |
---|
212 | | - while (l + 1 < r) { |
---|
213 | | - seq = list_entry(list->prev, struct journal_replay, |
---|
214 | | - list)->j.seq; |
---|
215 | | - |
---|
216 | | - m = (l + r) >> 1; |
---|
217 | | - read_bucket(m); |
---|
218 | | - |
---|
219 | | - if (seq != list_entry(list->prev, struct journal_replay, |
---|
220 | | - list)->j.seq) |
---|
221 | | - l = m; |
---|
222 | | - else |
---|
223 | | - r = m; |
---|
224 | | - } |
---|
225 | | - |
---|
226 | | - /* |
---|
227 | | - * Read buckets in reverse order until we stop finding more |
---|
228 | | - * journal entries |
---|
229 | | - */ |
---|
230 | | - pr_debug("finishing up: m %u njournal_buckets %u", |
---|
231 | | - m, ca->sb.njournal_buckets); |
---|
232 | | - l = m; |
---|
233 | | - |
---|
234 | | - while (1) { |
---|
235 | | - if (!l--) |
---|
236 | | - l = ca->sb.njournal_buckets - 1; |
---|
237 | | - |
---|
238 | | - if (l == m) |
---|
239 | | - break; |
---|
240 | | - |
---|
241 | | - if (test_bit(l, bitmap)) |
---|
242 | | - continue; |
---|
243 | | - |
---|
244 | | - if (!read_bucket(l)) |
---|
245 | | - break; |
---|
246 | | - } |
---|
247 | | - |
---|
248 | | - seq = 0; |
---|
249 | | - |
---|
250 | | - for (i = 0; i < ca->sb.njournal_buckets; i++) |
---|
251 | | - if (ja->seq[i] > seq) { |
---|
252 | | - seq = ja->seq[i]; |
---|
253 | | - /* |
---|
254 | | - * When journal_reclaim() goes to allocate for |
---|
255 | | - * the first time, it'll use the bucket after |
---|
256 | | - * ja->cur_idx |
---|
257 | | - */ |
---|
258 | | - ja->cur_idx = i; |
---|
259 | | - ja->last_idx = ja->discard_idx = (i + 1) % |
---|
260 | | - ca->sb.njournal_buckets; |
---|
261 | | - |
---|
262 | | - } |
---|
| 207 | + if (read_bucket(l)) |
---|
| 208 | + goto bsearch; |
---|
263 | 209 | } |
---|
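The probe order above is Fibonacci hashing: bucket `(i * 2654435769U) % njournal_buckets` for i = 0, 1, 2, ..., which always starts at bucket 0 (required, since the journal buckets are a circular log that may have wrapped) and then spreads later probes across the range so a run of buckets holding valid entries is hit after few reads. A tiny program that just prints the probe sequence for an assumed bucket count:

```c
#include <stdio.h>

/* 2654435769 ~= 2^32 / golden ratio, the constant used above */
#define GOLDEN_RATIO_32	2654435769U

int main(void)
{
	unsigned int nbuckets = 10;	/* hypothetical njournal_buckets */
	unsigned int i;

	for (i = 0; i < nbuckets; i++)
		printf("probe %u -> bucket %u\n",
		       i, (i * GOLDEN_RATIO_32) % nbuckets);
	return 0;
}
```

Note the multiplication deliberately wraps in 32-bit unsigned arithmetic, exactly as the kernel expression does.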
264 | 210 | |
---|
| 211 | + /* |
---|
| 212 | + * If that fails, check all the buckets we haven't checked |
---|
| 213 | + * already |
---|
| 214 | + */ |
---|
| 215 | + pr_debug("falling back to linear search\n"); |
---|
| 216 | + |
---|
| 217 | + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) |
---|
| 218 | + if (read_bucket(l)) |
---|
| 219 | + goto bsearch; |
---|
| 220 | + |
---|
| 221 | + /* no journal entries on this device? */ |
---|
| 222 | + if (l == ca->sb.njournal_buckets) |
---|
| 223 | + goto out; |
---|
| 224 | +bsearch: |
---|
| 225 | + BUG_ON(list_empty(list)); |
---|
| 226 | + |
---|
| 227 | + /* Binary search */ |
---|
| 228 | + m = l; |
---|
| 229 | + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); |
---|
| 230 | + pr_debug("starting binary search, l %u r %u\n", l, r); |
---|
| 231 | + |
---|
| 232 | + while (l + 1 < r) { |
---|
| 233 | + seq = list_entry(list->prev, struct journal_replay, |
---|
| 234 | + list)->j.seq; |
---|
| 235 | + |
---|
| 236 | + m = (l + r) >> 1; |
---|
| 237 | + read_bucket(m); |
---|
| 238 | + |
---|
| 239 | + if (seq != list_entry(list->prev, struct journal_replay, |
---|
| 240 | + list)->j.seq) |
---|
| 241 | + l = m; |
---|
| 242 | + else |
---|
| 243 | + r = m; |
---|
| 244 | + } |
---|
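The loop above is a plain binary search over the bucket index range (l, r): l is a bucket already known to hold journal entries, r is the next already-read bucket found in the bitmap (or the end of the range), and reading the midpoint decides which half to keep, depending on whether the newest seq on the replay list moved. A minimal sketch of the same shape with the "did this bucket add entries?" test stubbed out:

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-in for "read_bucket(m) added new journal entries".  The hunk
 * above detects that by comparing the newest seq on the replay list
 * before and after reading bucket m.
 */
static bool bucket_has_entries(unsigned int m)
{
	return m <= 5;		/* pretend buckets 0..5 hold journal data */
}

/*
 * Same shape as the loop above: l is known to hold entries, r is known
 * (or assumed) not to add any; shrink the window until they touch.
 */
static unsigned int last_bucket_with_entries(unsigned int l, unsigned int r)
{
	while (l + 1 < r) {
		unsigned int m = (l + r) / 2;

		if (bucket_has_entries(m))
			l = m;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	printf("last bucket with entries: %u\n",
	       last_bucket_with_entries(0, 9));
	return 0;
}
```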
| 245 | + |
---|
| 246 | + /* |
---|
| 247 | + * Read buckets in reverse order until we stop finding more |
---|
| 248 | + * journal entries |
---|
| 249 | + */ |
---|
| 250 | + pr_debug("finishing up: m %u njournal_buckets %u\n", |
---|
| 251 | + m, ca->sb.njournal_buckets); |
---|
| 252 | + l = m; |
---|
| 253 | + |
---|
| 254 | + while (1) { |
---|
| 255 | + if (!l--) |
---|
| 256 | + l = ca->sb.njournal_buckets - 1; |
---|
| 257 | + |
---|
| 258 | + if (l == m) |
---|
| 259 | + break; |
---|
| 260 | + |
---|
| 261 | + if (test_bit(l, bitmap)) |
---|
| 262 | + continue; |
---|
| 263 | + |
---|
| 264 | + if (!read_bucket(l)) |
---|
| 265 | + break; |
---|
| 266 | + } |
---|
| 267 | + |
---|
| 268 | + seq = 0; |
---|
| 269 | + |
---|
| 270 | + for (i = 0; i < ca->sb.njournal_buckets; i++) |
---|
| 271 | + if (ja->seq[i] > seq) { |
---|
| 272 | + seq = ja->seq[i]; |
---|
| 273 | + /* |
---|
| 274 | + * When journal_reclaim() goes to allocate for |
---|
| 275 | + * the first time, it'll use the bucket after |
---|
| 276 | + * ja->cur_idx |
---|
| 277 | + */ |
---|
| 278 | + ja->cur_idx = i; |
---|
| 279 | + ja->last_idx = ja->discard_idx = (i + 1) % |
---|
| 280 | + ca->sb.njournal_buckets; |
---|
| 281 | + |
---|
| 282 | + } |
---|
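The final loop picks the bucket holding the newest journal seq: that index becomes ja->cur_idx, and the bucket right after it (modulo njournal_buckets) becomes both last_idx and discard_idx, so that journal_reclaim()'s first allocation uses the bucket after cur_idx, as the comment says. A compact sketch with made-up per-bucket seq values:

```c
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 8	/* hypothetical njournal_buckets */

int main(void)
{
	/* newest journal seq seen in each bucket; 0 means nothing was read */
	uint64_t bucket_seq[NBUCKETS] = { 0, 0, 40, 55, 70, 0, 0, 10 };
	uint64_t seq = 0;
	unsigned int i, cur_idx = 0, last_idx = 0;

	/* pick the bucket holding the newest entries, as in the loop above */
	for (i = 0; i < NBUCKETS; i++) {
		if (bucket_seq[i] > seq) {
			seq = bucket_seq[i];
			cur_idx = i;
			/* reclaim/discard restart just after the newest bucket */
			last_idx = (i + 1) % NBUCKETS;
		}
	}

	printf("cur_idx=%u last_idx=discard_idx=%u (seq %llu)\n",
	       cur_idx, last_idx, (unsigned long long)seq);
	return 0;
}
```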
| 283 | + |
---|
| 284 | +out: |
---|
265 | 285 | if (!list_empty(list)) |
---|
266 | 286 | c->journal.seq = list_entry(list->prev, |
---|
267 | 287 | struct journal_replay, |
---|
.. | .. |
---|
317 | 337 | } |
---|
318 | 338 | } |
---|
319 | 339 | |
---|
320 | | -bool is_discard_enabled(struct cache_set *s) |
---|
| 340 | +static bool is_discard_enabled(struct cache_set *s) |
---|
321 | 341 | { |
---|
322 | | - struct cache *ca; |
---|
323 | | - unsigned int i; |
---|
| 342 | + struct cache *ca = s->cache; |
---|
324 | 343 | |
---|
325 | | - for_each_cache(ca, s, i) |
---|
326 | | - if (ca->discard) |
---|
327 | | - return true; |
---|
| 344 | + if (ca->discard) |
---|
| 345 | + return true; |
---|
328 | 346 | |
---|
329 | 347 | return false; |
---|
330 | 348 | } |
---|
.. | .. |
---|
344 | 362 | |
---|
345 | 363 | if (n != i->j.seq) { |
---|
346 | 364 | if (n == start && is_discard_enabled(s)) |
---|
347 | | - pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)", |
---|
| 365 | + pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", |
---|
348 | 366 | n, i->j.seq - 1, start, end); |
---|
349 | 367 | else { |
---|
350 | | - pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", |
---|
| 368 | + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", |
---|
351 | 369 | n, i->j.seq - 1, start, end); |
---|
352 | 370 | ret = -EIO; |
---|
353 | 371 | goto err; |
---|
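During replay this hunk walks the expected sequence numbers and reports holes: a gap right at the start is only a warning when discards are enabled (the missing entries may simply have been discarded), while any other gap is treated as corruption and replay fails with -EIO. A userspace sketch of that gap check with invented sequence data:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Walk replayed journal seq numbers looking for holes, in the spirit of
 * the hunk above: a hole at the very start is tolerated (and only
 * reported) when discards are enabled, anything else is an error.
 */
static int check_seq_gaps(const uint64_t *seqs, unsigned int n_entries,
			  uint64_t start, bool discard_enabled)
{
	uint64_t n = start;
	unsigned int i;

	for (i = 0; i < n_entries; i++) {
		if (n != seqs[i]) {
			if (n == start && discard_enabled) {
				printf("entries %llu-%llu may be discarded\n",
				       (unsigned long long)n,
				       (unsigned long long)(seqs[i] - 1));
			} else {
				printf("entries %llu-%llu missing!\n",
				       (unsigned long long)n,
				       (unsigned long long)(seqs[i] - 1));
				return -1;	/* the kernel returns -EIO here */
			}
		}
		n = seqs[i] + 1;
	}
	return 0;
}

int main(void)
{
	/* seq 15 is missing in the middle, so this reports an error */
	uint64_t seqs[] = { 12, 13, 14, 16, 17 };

	return check_seq_gaps(seqs,
			      (unsigned int)(sizeof(seqs) / sizeof(seqs[0])),
			      12, true) ? 1 : 0;
}
```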
.. | .. |
---|
377 | 395 | entries++; |
---|
378 | 396 | } |
---|
379 | 397 | |
---|
380 | | - pr_info("journal replay done, %i keys in %i entries, seq %llu", |
---|
| 398 | + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", |
---|
381 | 399 | keys, entries, end); |
---|
382 | 400 | err: |
---|
383 | 401 | while (!list_empty(list)) { |
---|
.. | .. |
---|
389 | 407 | return ret; |
---|
390 | 408 | } |
---|
391 | 409 | |
---|
| 410 | +void bch_journal_space_reserve(struct journal *j) |
---|
| 411 | +{ |
---|
| 412 | + j->do_reserve = true; |
---|
| 413 | +} |
---|
| 414 | + |
---|
392 | 415 | /* Journalling */ |
---|
393 | 416 | |
---|
394 | 417 | static void btree_flush_write(struct cache_set *c) |
---|
395 | 418 | { |
---|
396 | | - /* |
---|
397 | | - * Try to find the btree node with that references the oldest journal |
---|
398 | | - * entry, best is our current candidate and is locked if non NULL: |
---|
399 | | - */ |
---|
400 | | - struct btree *b, *best; |
---|
401 | | - unsigned int i; |
---|
| 419 | + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; |
---|
| 420 | + unsigned int i, nr; |
---|
| 421 | + int ref_nr; |
---|
| 422 | + atomic_t *fifo_front_p, *now_fifo_front_p; |
---|
| 423 | + size_t mask; |
---|
402 | 424 | |
---|
| 425 | + if (c->journal.btree_flushing) |
---|
| 426 | + return; |
---|
| 427 | + |
---|
| 428 | + spin_lock(&c->journal.flush_write_lock); |
---|
| 429 | + if (c->journal.btree_flushing) { |
---|
| 430 | + spin_unlock(&c->journal.flush_write_lock); |
---|
| 431 | + return; |
---|
| 432 | + } |
---|
| 433 | + c->journal.btree_flushing = true; |
---|
| 434 | + spin_unlock(&c->journal.flush_write_lock); |
---|
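The btree_flushing/flush_write_lock dance above makes btree_flush_write() effectively single-threaded: a cheap unlocked peek, then a re-check under the lock before claiming the flag, so concurrent callers return immediately instead of piling up. A pthread sketch of the same guard, with a mutex standing in for the kernel spinlock (the unlocked peek is only an optimisation; a stale read just costs one extra locked check or skips a flush someone else is already doing):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;
static bool flushing;

/*
 * Same pattern as the hunk above: opportunistic unlocked check, then a
 * locked re-check, so at most one caller does the expensive flush and
 * everyone else bails out instead of queueing up behind it.
 */
static void try_flush(void)
{
	if (flushing)
		return;			/* someone is already at it */

	pthread_mutex_lock(&flush_lock);
	if (flushing) {
		pthread_mutex_unlock(&flush_lock);
		return;			/* lost the race, that is fine */
	}
	flushing = true;
	pthread_mutex_unlock(&flush_lock);

	puts("flushing...");		/* the actual expensive work */

	pthread_mutex_lock(&flush_lock);
	flushing = false;
	pthread_mutex_unlock(&flush_lock);
}

int main(void)
{
	try_flush();
	return 0;
}
```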
| 435 | + |
---|
| 436 | + /* get the oldest journal entry and check its refcount */ |
---|
| 437 | + spin_lock(&c->journal.lock); |
---|
| 438 | + fifo_front_p = &fifo_front(&c->journal.pin); |
---|
| 439 | + ref_nr = atomic_read(fifo_front_p); |
---|
| 440 | + if (ref_nr <= 0) { |
---|
| 441 | + /* |
---|
| 442 | + * do nothing if no btree node references |
---|
| 443 | + * the oldest journal entry |
---|
| 444 | + */ |
---|
| 445 | + spin_unlock(&c->journal.lock); |
---|
| 446 | + goto out; |
---|
| 447 | + } |
---|
| 448 | + spin_unlock(&c->journal.lock); |
---|
| 449 | + |
---|
| 450 | + mask = c->journal.pin.mask; |
---|
| 451 | + nr = 0; |
---|
403 | 452 | atomic_long_inc(&c->flush_write); |
---|
404 | | -retry: |
---|
405 | | - best = NULL; |
---|
| 453 | + memset(btree_nodes, 0, sizeof(btree_nodes)); |
---|
406 | 454 | |
---|
407 | 455 | mutex_lock(&c->bucket_lock); |
---|
408 | | - for_each_cached_btree(b, c, i) |
---|
409 | | - if (btree_current_write(b)->journal) { |
---|
410 | | - if (!best) |
---|
411 | | - best = b; |
---|
412 | | - else if (journal_pin_cmp(c, |
---|
413 | | - btree_current_write(best)->journal, |
---|
414 | | - btree_current_write(b)->journal)) { |
---|
415 | | - best = b; |
---|
416 | | - } |
---|
| 456 | + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { |
---|
| 457 | + /* |
---|
| 458 | + * It is safe to get now_fifo_front_p without holding |
---|
| 459 | + * c->journal.lock here, because we don't need to know |
---|
| 460 | + * the exact value, just check whether the |
---|
| 461 | + * front pointer of c->journal.pin has changed. |
---|
| 462 | + */ |
---|
| 463 | + now_fifo_front_p = &fifo_front(&c->journal.pin); |
---|
| 464 | + /* |
---|
| 465 | + * If the oldest journal entry is reclaimed and front |
---|
| 466 | + * pointer of c->journal.pin changes, it is unnecessary |
---|
| 467 | + * to scan c->btree_cache anymore, just quit the loop and |
---|
| 468 | + * flush out what we have already. |
---|
| 469 | + */ |
---|
| 470 | + if (now_fifo_front_p != fifo_front_p) |
---|
| 471 | + break; |
---|
| 472 | + /* |
---|
| 473 | + * quit this loop if all matching btree nodes are |
---|
| 474 | + * scanned and recorded in btree_nodes[] already. |
---|
| 475 | + */ |
---|
| 476 | + ref_nr = atomic_read(fifo_front_p); |
---|
| 477 | + if (nr >= ref_nr) |
---|
| 478 | + break; |
---|
| 479 | + |
---|
| 480 | + if (btree_node_journal_flush(b)) |
---|
| 481 | + pr_err("BUG: flush_write bit should not be set here!\n"); |
---|
| 482 | + |
---|
| 483 | + mutex_lock(&b->write_lock); |
---|
| 484 | + |
---|
| 485 | + if (!btree_node_dirty(b)) { |
---|
| 486 | + mutex_unlock(&b->write_lock); |
---|
| 487 | + continue; |
---|
417 | 488 | } |
---|
418 | 489 | |
---|
419 | | - b = best; |
---|
420 | | - if (b) |
---|
| 490 | + if (!btree_current_write(b)->journal) { |
---|
| 491 | + mutex_unlock(&b->write_lock); |
---|
| 492 | + continue; |
---|
| 493 | + } |
---|
| 494 | + |
---|
| 495 | + /* |
---|
| 496 | + * Only select the btree node which exactly references |
---|
| 497 | + * the oldest journal entry. |
---|
| 498 | + * |
---|
| 499 | + * If the journal entry pointed to by fifo_front_p is |
---|
| 500 | + * reclaimed in parallel, don't worry: |
---|
| 501 | + * - the list_for_each_xxx loop will quit when checking |
---|
| 502 | + * next now_fifo_front_p. |
---|
| 503 | + * - If there are matched nodes recorded in btree_nodes[], |
---|
| 504 | + * they are clean now (this is why and how the oldest |
---|
| 505 | + * journal entry can be reclaimed). These selected nodes |
---|
| 506 | + * will be ignored and skipped in the following for-loop. |
---|
| 507 | + */ |
---|
| 508 | + if (((btree_current_write(b)->journal - fifo_front_p) & |
---|
| 509 | + mask) != 0) { |
---|
| 510 | + mutex_unlock(&b->write_lock); |
---|
| 511 | + continue; |
---|
| 512 | + } |
---|
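The `(journal - fifo_front_p) & mask` test above leans on the journal pin fifo using a power-of-two mask: subtracting the front element's address from a pin pointer and masking gives that entry's distance from the front, so only btree nodes whose journal pin is the oldest entry (distance 0) get collected. A small demonstration of the pointer-difference-and-mask idea on a plain array standing in for the fifo:

```c
#include <stddef.h>
#include <stdio.h>

#define FIFO_SIZE 8			/* must be a power of two */
#define FIFO_MASK (FIFO_SIZE - 1)

/* A plain array stands in for the power-of-two ring of journal pins. */
static int pin[FIFO_SIZE];

/* Distance of 'slot' from 'front', wrapping around the ring. */
static size_t distance_from_front(const int *slot, const int *front)
{
	return (size_t)(slot - front) & FIFO_MASK;
}

int main(void)
{
	const int *front = &pin[5];	/* pretend the oldest entry sits here */

	printf("%zu\n", distance_from_front(&pin[5], front));	/* 0: the front */
	printf("%zu\n", distance_from_front(&pin[6], front));	/* 1 */
	printf("%zu\n", distance_from_front(&pin[1], front));	/* 4: wrapped */
	return 0;
}
```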
| 513 | + |
---|
421 | 514 | set_btree_node_journal_flush(b); |
---|
| 515 | + |
---|
| 516 | + mutex_unlock(&b->write_lock); |
---|
| 517 | + |
---|
| 518 | + btree_nodes[nr++] = b; |
---|
| 519 | + /* |
---|
| 520 | + * To avoid holding c->bucket_lock for too long, scan |
---|
| 521 | + * at most BTREE_FLUSH_NR matching btree nodes. If |
---|
| 522 | + * more btree nodes still reference the oldest |
---|
| 523 | + * journal entry, try to flush them the next time |
---|
| 524 | + * btree_flush_write() is called. |
---|
| 525 | + */ |
---|
| 526 | + if (nr == BTREE_FLUSH_NR) |
---|
| 527 | + break; |
---|
| 528 | + } |
---|
422 | 529 | mutex_unlock(&c->bucket_lock); |
---|
423 | 530 | |
---|
424 | | - if (b) { |
---|
| 531 | + for (i = 0; i < nr; i++) { |
---|
| 532 | + b = btree_nodes[i]; |
---|
| 533 | + if (!b) { |
---|
| 534 | + pr_err("BUG: btree_nodes[%d] is NULL\n", i); |
---|
| 535 | + continue; |
---|
| 536 | + } |
---|
| 537 | + |
---|
| 538 | + /* safe to check without holding b->write_lock */ |
---|
| 539 | + if (!btree_node_journal_flush(b)) { |
---|
| 540 | + pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); |
---|
| 541 | + continue; |
---|
| 542 | + } |
---|
| 543 | + |
---|
425 | 544 | mutex_lock(&b->write_lock); |
---|
426 | 545 | if (!btree_current_write(b)->journal) { |
---|
427 | 546 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
---|
428 | 547 | mutex_unlock(&b->write_lock); |
---|
429 | | - /* We raced */ |
---|
430 | | - atomic_long_inc(&c->retry_flush_write); |
---|
431 | | - goto retry; |
---|
| 548 | + pr_debug("bnode %p: written by others\n", b); |
---|
| 549 | + continue; |
---|
| 550 | + } |
---|
| 551 | + |
---|
| 552 | + if (!btree_node_dirty(b)) { |
---|
| 553 | + clear_bit(BTREE_NODE_journal_flush, &b->flags); |
---|
| 554 | + mutex_unlock(&b->write_lock); |
---|
| 555 | + pr_debug("bnode %p: dirty bit cleaned by others\n", b); |
---|
| 556 | + continue; |
---|
432 | 557 | } |
---|
433 | 558 | |
---|
434 | 559 | __bch_btree_node_write(b, NULL); |
---|
435 | 560 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
---|
436 | 561 | mutex_unlock(&b->write_lock); |
---|
437 | 562 | } |
---|
| 563 | + |
---|
| 564 | +out: |
---|
| 565 | + spin_lock(&c->journal.flush_write_lock); |
---|
| 566 | + c->journal.btree_flushing = false; |
---|
| 567 | + spin_unlock(&c->journal.flush_write_lock); |
---|
438 | 568 | } |
---|
439 | 569 | |
---|
440 | 570 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) |
---|
.. | .. |
---|
478 | 608 | ca->sb.njournal_buckets; |
---|
479 | 609 | |
---|
480 | 610 | atomic_set(&ja->discard_in_flight, DISCARD_READY); |
---|
481 | | - /* fallthrough */ |
---|
| 611 | + fallthrough; |
---|
482 | 612 | |
---|
483 | 613 | case DISCARD_READY: |
---|
484 | 614 | if (ja->discard_idx == ja->last_idx) |
---|
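The `/* fallthrough */` comment becomes the kernel's `fallthrough` pseudo-keyword, which expands to `__attribute__((__fallthrough__))` on compilers that support it, so -Wimplicit-fallthrough can verify the intent. A userspace approximation of the macro, used in a switch loosely shaped like do_journal_discard()'s state machine (the state values here are made up):

```c
#include <stdio.h>

/*
 * Userspace stand-in for the kernel's 'fallthrough' pseudo-keyword
 * (defined for real in <linux/compiler_attributes.h>).
 */
#if defined(__has_attribute)
# if __has_attribute(__fallthrough__)
#  define fallthrough	__attribute__((__fallthrough__))
# endif
#endif
#ifndef fallthrough
# define fallthrough	do {} while (0)	/* nothing to annotate */
#endif

static const char *discard_step(int state)
{
	switch (state) {
	case 0:				/* DISCARD_DONE-ish */
		puts("discard finished, advance index");
		fallthrough;		/* deliberate: go straight to READY */
	case 1:				/* DISCARD_READY-ish */
		return "ready";
	default:
		return "in flight";
	}
}

int main(void)
{
	puts(discard_step(0));
	return 0;
}
```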
.. | .. |
---|
500 | 630 | } |
---|
501 | 631 | } |
---|
502 | 632 | |
---|
| 633 | +static unsigned int free_journal_buckets(struct cache_set *c) |
---|
| 634 | +{ |
---|
| 635 | + struct journal *j = &c->journal; |
---|
| 636 | + struct cache *ca = c->cache; |
---|
| 637 | + struct journal_device *ja = &c->cache->journal; |
---|
| 638 | + unsigned int n; |
---|
| 639 | + |
---|
| 640 | + /* In case njournal_buckets is not a power of 2 */ |
---|
| 641 | + if (ja->cur_idx >= ja->discard_idx) |
---|
| 642 | + n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; |
---|
| 643 | + else |
---|
| 644 | + n = ja->discard_idx - ja->cur_idx; |
---|
| 645 | + |
---|
| 646 | + if (n > (1 + j->do_reserve)) |
---|
| 647 | + return n - (1 + j->do_reserve); |
---|
| 648 | + |
---|
| 649 | + return 0; |
---|
| 650 | +} |
---|
| 651 | + |
---|
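free_journal_buckets() counts how many journal buckets are still allocatable on the (now single) cache device: the distance from cur_idx to discard_idx is taken around the ring, with an explicit wrap because njournal_buckets need not be a power of two, and one bucket, or two once bch_journal_space_reserve() has set do_reserve, is always held back. A standalone sketch of the same arithmetic:

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Free-bucket count for a circular journal, mirroring the helper above.
 * 'nbuckets' need not be a power of two, hence the explicit wrap.
 */
static unsigned int free_buckets(unsigned int nbuckets,
				 unsigned int cur_idx,
				 unsigned int discard_idx,
				 bool do_reserve)
{
	unsigned int n, reserve = 1 + (do_reserve ? 1 : 0);

	if (cur_idx >= discard_idx)
		n = nbuckets + discard_idx - cur_idx;	/* wrapped */
	else
		n = discard_idx - cur_idx;

	return n > reserve ? n - reserve : 0;
}

int main(void)
{
	/* 8 buckets, currently writing at 6, discards caught up to 2 */
	printf("%u free without reserve\n", free_buckets(8, 6, 2, false));
	printf("%u free with reserve\n", free_buckets(8, 6, 2, true));
	return 0;
}
```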
503 | 652 | static void journal_reclaim(struct cache_set *c) |
---|
504 | 653 | { |
---|
505 | 654 | struct bkey *k = &c->journal.key; |
---|
506 | | - struct cache *ca; |
---|
| 655 | + struct cache *ca = c->cache; |
---|
507 | 656 | uint64_t last_seq; |
---|
508 | | - unsigned int iter, n = 0; |
---|
| 657 | + struct journal_device *ja = &ca->journal; |
---|
509 | 658 | atomic_t p __maybe_unused; |
---|
510 | 659 | |
---|
511 | 660 | atomic_long_inc(&c->reclaim); |
---|
.. | .. |
---|
517 | 666 | |
---|
518 | 667 | /* Update last_idx */ |
---|
519 | 668 | |
---|
520 | | - for_each_cache(ca, c, iter) { |
---|
521 | | - struct journal_device *ja = &ca->journal; |
---|
| 669 | + while (ja->last_idx != ja->cur_idx && |
---|
| 670 | + ja->seq[ja->last_idx] < last_seq) |
---|
| 671 | + ja->last_idx = (ja->last_idx + 1) % |
---|
| 672 | + ca->sb.njournal_buckets; |
---|
522 | 673 | |
---|
523 | | - while (ja->last_idx != ja->cur_idx && |
---|
524 | | - ja->seq[ja->last_idx] < last_seq) |
---|
525 | | - ja->last_idx = (ja->last_idx + 1) % |
---|
526 | | - ca->sb.njournal_buckets; |
---|
527 | | - } |
---|
528 | | - |
---|
529 | | - for_each_cache(ca, c, iter) |
---|
530 | | - do_journal_discard(ca); |
---|
| 674 | + do_journal_discard(ca); |
---|
531 | 675 | |
---|
532 | 676 | if (c->journal.blocks_free) |
---|
533 | 677 | goto out; |
---|
534 | 678 | |
---|
535 | | - /* |
---|
536 | | - * Allocate: |
---|
537 | | - * XXX: Sort by free journal space |
---|
538 | | - */ |
---|
| 679 | + if (!free_journal_buckets(c)) |
---|
| 680 | + goto out; |
---|
539 | 681 | |
---|
540 | | - for_each_cache(ca, c, iter) { |
---|
541 | | - struct journal_device *ja = &ca->journal; |
---|
542 | | - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; |
---|
| 682 | + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; |
---|
| 683 | + k->ptr[0] = MAKE_PTR(0, |
---|
| 684 | + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), |
---|
| 685 | + ca->sb.nr_this_dev); |
---|
| 686 | + atomic_long_inc(&c->reclaimed_journal_buckets); |
---|
543 | 687 | |
---|
544 | | - /* No space available on this device */ |
---|
545 | | - if (next == ja->discard_idx) |
---|
546 | | - continue; |
---|
| 688 | + bkey_init(k); |
---|
| 689 | + SET_KEY_PTRS(k, 1); |
---|
| 690 | + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; |
---|
547 | 691 | |
---|
548 | | - ja->cur_idx = next; |
---|
549 | | - k->ptr[n++] = MAKE_PTR(0, |
---|
550 | | - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), |
---|
551 | | - ca->sb.nr_this_dev); |
---|
552 | | - } |
---|
553 | | - |
---|
554 | | - if (n) { |
---|
555 | | - bkey_init(k); |
---|
556 | | - SET_KEY_PTRS(k, n); |
---|
557 | | - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; |
---|
558 | | - } |
---|
559 | 692 | out: |
---|
560 | 693 | if (!journal_full(&c->journal)) |
---|
561 | 694 | __closure_wake_up(&c->journal.wait); |
---|
.. | .. |
---|
582 | 715 | j->cur->data->keys = 0; |
---|
583 | 716 | |
---|
584 | 717 | if (fifo_full(&j->pin)) |
---|
585 | | - pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); |
---|
| 718 | + pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); |
---|
586 | 719 | } |
---|
587 | 720 | |
---|
588 | 721 | static void journal_write_endio(struct bio *bio) |
---|
.. | .. |
---|
619 | 752 | __releases(c->journal.lock) |
---|
620 | 753 | { |
---|
621 | 754 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); |
---|
622 | | - struct cache *ca; |
---|
| 755 | + struct cache *ca = c->cache; |
---|
623 | 756 | struct journal_write *w = c->journal.cur; |
---|
624 | 757 | struct bkey *k = &c->journal.key; |
---|
625 | | - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * |
---|
626 | | - c->sb.block_size; |
---|
| 758 | + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * |
---|
| 759 | + ca->sb.block_size; |
---|
627 | 760 | |
---|
628 | 761 | struct bio *bio; |
---|
629 | 762 | struct bio_list list; |
---|
.. | .. |
---|
642 | 775 | return; |
---|
643 | 776 | } |
---|
644 | 777 | |
---|
645 | | - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); |
---|
| 778 | + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); |
---|
646 | 779 | |
---|
647 | 780 | w->data->btree_level = c->root->level; |
---|
648 | 781 | |
---|
649 | 782 | bkey_copy(&w->data->btree_root, &c->root->key); |
---|
650 | 783 | bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); |
---|
651 | 784 | |
---|
652 | | - for_each_cache(ca, c, i) |
---|
653 | | - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
---|
654 | | - |
---|
655 | | - w->data->magic = jset_magic(&c->sb); |
---|
| 785 | + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
---|
| 786 | + w->data->magic = jset_magic(&ca->sb); |
---|
656 | 787 | w->data->version = BCACHE_JSET_VERSION; |
---|
657 | 788 | w->data->last_seq = last_seq(&c->journal); |
---|
658 | 789 | w->data->csum = csum_set(w->data); |
---|
.. | .. |
---|
674 | 805 | REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); |
---|
675 | 806 | bch_bio_map(bio, w->data); |
---|
676 | 807 | |
---|
677 | | - trace_bcache_journal_write(bio); |
---|
| 808 | + trace_bcache_journal_write(bio, w->data->keys); |
---|
678 | 809 | bio_list_add(&list, bio); |
---|
679 | 810 | |
---|
680 | 811 | SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); |
---|
.. | .. |
---|
728 | 859 | size_t sectors; |
---|
729 | 860 | struct closure cl; |
---|
730 | 861 | bool wait = false; |
---|
| 862 | + struct cache *ca = c->cache; |
---|
731 | 863 | |
---|
732 | 864 | closure_init_stack(&cl); |
---|
733 | 865 | |
---|
.. | .. |
---|
737 | 869 | struct journal_write *w = c->journal.cur; |
---|
738 | 870 | |
---|
739 | 871 | sectors = __set_blocks(w->data, w->data->keys + nkeys, |
---|
740 | | - block_bytes(c)) * c->sb.block_size; |
---|
| 872 | + block_bytes(ca)) * ca->sb.block_size; |
---|
741 | 873 | |
---|
742 | 874 | if (sectors <= min_t(size_t, |
---|
743 | | - c->journal.blocks_free * c->sb.block_size, |
---|
| 875 | + c->journal.blocks_free * ca->sb.block_size, |
---|
744 | 876 | PAGE_SECTORS << JSET_BITS)) |
---|
745 | 877 | return w; |
---|
746 | 878 | |
---|
.. | .. |
---|
805 | 937 | if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) |
---|
806 | 938 | return NULL; |
---|
807 | 939 | |
---|
808 | | - if (!CACHE_SYNC(&c->sb)) |
---|
| 940 | + if (!CACHE_SYNC(&c->cache->sb)) |
---|
809 | 941 | return NULL; |
---|
810 | 942 | |
---|
811 | 943 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
---|
.. | .. |
---|
821 | 953 | journal_try_write(c); |
---|
822 | 954 | } else if (!w->dirty) { |
---|
823 | 955 | w->dirty = true; |
---|
824 | | - schedule_delayed_work(&c->journal.work, |
---|
825 | | - msecs_to_jiffies(c->journal_delay_ms)); |
---|
| 956 | + queue_delayed_work(bch_flush_wq, &c->journal.work, |
---|
| 957 | + msecs_to_jiffies(c->journal_delay_ms)); |
---|
826 | 958 | spin_unlock(&c->journal.lock); |
---|
827 | 959 | } else { |
---|
828 | 960 | spin_unlock(&c->journal.lock); |
---|
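The delayed journal write is now queued on the dedicated bch_flush_wq workqueue instead of going through schedule_delayed_work() and the system workqueue. A minimal kernel-module-style sketch of that pattern; the workqueue name and the WQ_MEM_RECLAIM flag are assumptions for illustration, not necessarily what bcache's super.c uses when it creates bch_flush_wq:

```c
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/errno.h>

static struct workqueue_struct *demo_flush_wq;
static struct delayed_work demo_work;

static void demo_flush_fn(struct work_struct *w)
{
	pr_info("demo flush work ran\n");
}

static int __init demo_init(void)
{
	/* flag choice is an assumption; a dedicated wq is the point */
	demo_flush_wq = alloc_workqueue("demo_flush", WQ_MEM_RECLAIM, 0);
	if (!demo_flush_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_work, demo_flush_fn);
	/* analogous to queueing journal.work after journal_delay_ms */
	queue_delayed_work(demo_flush_wq, &demo_work, msecs_to_jiffies(100));
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_flush_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```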
.. | .. |
---|
856 | 988 | struct journal *j = &c->journal; |
---|
857 | 989 | |
---|
858 | 990 | spin_lock_init(&j->lock); |
---|
| 991 | + spin_lock_init(&j->flush_write_lock); |
---|
859 | 992 | INIT_DELAYED_WORK(&j->work, journal_write_work); |
---|
860 | 993 | |
---|
861 | 994 | c->journal_delay_ms = 100; |
---|