| .. | .. |
| 47 | 47 | |
| 48 | 48 | closure_init_stack(&cl); |
| 49 | 49 | |
| 50 | | - pr_debug("reading %u", bucket_index); |
| | 50 | + pr_debug("reading %u\n", bucket_index); |
| 51 | 51 | |
| 52 | 52 | while (offset < ca->sb.bucket_size) { |
| 53 | 53 | reread: left = ca->sb.bucket_size - offset; |
| .. | .. |
| 78 | 78 | size_t blocks, bytes = set_bytes(j); |
| 79 | 79 | |
| 80 | 80 | if (j->magic != jset_magic(&ca->sb)) { |
| 81 | | - pr_debug("%u: bad magic", bucket_index); |
| | 81 | + pr_debug("%u: bad magic\n", bucket_index); |
| 82 | 82 | return ret; |
| 83 | 83 | } |
| 84 | 84 | |
| 85 | 85 | if (bytes > left << 9 || |
| 86 | 86 | bytes > PAGE_SIZE << JSET_BITS) { |
| 87 | | - pr_info("%u: too big, %zu bytes, offset %u", |
| | 87 | + pr_info("%u: too big, %zu bytes, offset %u\n", |
| 88 | 88 | bucket_index, bytes, offset); |
| 89 | 89 | return ret; |
| 90 | 90 | } |
| .. | .. |
| 93 | 93 | goto reread; |
| 94 | 94 | |
| 95 | 95 | if (j->csum != csum_set(j)) { |
| 96 | | - pr_info("%u: bad csum, %zu bytes, offset %u", |
| | 96 | + pr_info("%u: bad csum, %zu bytes, offset %u\n", |
| 97 | 97 | bucket_index, bytes, offset); |
| 98 | 98 | return ret; |
| 99 | 99 | } |
| 100 | 100 | |
| 101 | | - blocks = set_blocks(j, block_bytes(ca->set)); |
| | 101 | + blocks = set_blocks(j, block_bytes(ca)); |
| 102 | 102 | |
| | 103 | + /* |
| | 104 | + * Nodes in 'list' are in linear increasing order of |
| | 105 | + * i->j.seq; the node at the head has the smallest |
| | 106 | + * (oldest) journal seq, and the node at the tail |
| | 107 | + * has the biggest (latest) journal seq. |
| | 108 | + */ |
| | 109 | + |
| | 110 | + /* |
| | 111 | + * Check from the oldest jset for last_seq. If |
| | 112 | + * i->j.seq < j->last_seq, the oldest jset in the |
| | 113 | + * list is expired and useless; remove it from |
| | 114 | + * the list. Otherwise, j is a candidate jset |
| | 115 | + * for the following checks. |
| | 116 | + */ |
| 103 | 117 | while (!list_empty(list)) { |
| 104 | 118 | i = list_first_entry(list, |
| 105 | 119 | struct journal_replay, list); |
| .. | .. |
| 109 | 123 | kfree(i); |
| 110 | 124 | } |
| 111 | 125 | |
| | 126 | + /* iterate the list in reverse order (from the latest jset) */ |
| 112 | 127 | list_for_each_entry_reverse(i, list, list) { |
| 113 | 128 | if (j->seq == i->j.seq) |
| 114 | 129 | goto next_set; |
| 115 | 130 | |
| | 131 | + /* |
| | 132 | + * If j->seq is less than any i->j.last_seq |
| | 133 | + * in the list, j is an expired and useless jset. |
| | 134 | + */ |
| 116 | 135 | if (j->seq < i->j.last_seq) |
| 117 | 136 | goto next_set; |
| 118 | 137 | |
| | 138 | + /* |
| | 139 | + * 'where' points to the first jset in the list |
| | 140 | + * which is older than j. |
| | 141 | + */ |
| 119 | 142 | if (j->seq > i->j.seq) { |
| 120 | 143 | where = &i->list; |
| 121 | 144 | goto add; |
| .. | .. |
| 129 | 152 | if (!i) |
| 130 | 153 | return -ENOMEM; |
| 131 | 154 | memcpy(&i->j, j, bytes); |
| | 155 | + /* Add right after the location 'where' points to */ |
| 132 | 156 | list_add(&i->list, where); |
| 133 | 157 | ret = 1; |
| 134 | 158 | |
| 135 | | - ja->seq[bucket_index] = j->seq; |
| | 159 | + if (j->seq > ja->seq[bucket_index]) |
| | 160 | + ja->seq[bucket_index] = j->seq; |
| 136 | 161 | next_set: |
| 137 | 162 | offset += blocks * ca->sb.block_size; |
| 138 | 163 | len -= blocks * ca->sb.block_size; |
| .. | .. |
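The comments added above pin down the invariant this hunk maintains: the replay list stays sorted by journal sequence number, oldest at the head, and `list_add(&i->list, where)` inserts the new jset right after the first older node found by the reverse walk. A minimal userspace sketch of that insertion discipline (illustrative names, not the kernel code, and omitting the `last_seq` expiry check):

```c
#include <stdio.h>
#include <stdlib.h>

struct replay {
	unsigned long long seq;
	struct replay *prev, *next;
};

/* 'head' is a circular dummy node: head->next is the oldest entry,
 * head->prev the newest, and seq values increase head-to-tail. */
static void insert_sorted(struct replay *head, unsigned long long seq)
{
	struct replay *i, *n;

	/* walk from the newest entry backwards, as the hunk does */
	for (i = head->prev; i != head; i = i->prev) {
		if (i->seq == seq)
			return;		/* duplicate jset: skip it */
		if (seq > i->seq)
			break;		/* insert right after node i */
	}

	n = malloc(sizeof(*n));
	if (!n)
		return;
	n->seq = seq;
	n->prev = i;			/* i == head when seq is the oldest */
	n->next = i->next;
	i->next->prev = n;
	i->next = n;
}

int main(void)
{
	struct replay head = { 0, &head, &head };	/* empty circular list */

	insert_sorted(&head, 20);
	insert_sorted(&head, 10);
	insert_sorted(&head, 30);
	insert_sorted(&head, 20);	/* duplicate, dropped */

	for (struct replay *r = head.next; r != &head; r = r->next)
		printf("%llu ", r->seq);	/* prints: 10 20 30 */
	printf("\n");
	return 0;
}
```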
| 147 | 172 | { |
| 148 | 173 | #define read_bucket(b) \ |
| 149 | 174 | ({ \ |
| 150 | | - int ret = journal_read_bucket(ca, list, b); \ |
| | 175 | + ret = journal_read_bucket(ca, list, b); \ |
| 151 | 176 | __set_bit(b, bitmap); \ |
| 152 | 177 | if (ret < 0) \ |
| 153 | 178 | return ret; \ |
| 154 | 179 | ret; \ |
| 155 | 180 | }) |
| 156 | 181 | |
| 157 | | - struct cache *ca; |
| 158 | | - unsigned int iter; |
| | 182 | + struct cache *ca = c->cache; |
| | 183 | + int ret = 0; |
| | 184 | + struct journal_device *ja = &ca->journal; |
| | 185 | + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); |
| | 186 | + unsigned int i, l, r, m; |
| | 187 | + uint64_t seq; |
| 159 | 188 | |
| 160 | | - for_each_cache(ca, c, iter) { |
| 161 | | - struct journal_device *ja = &ca->journal; |
| 162 | | - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); |
| 163 | | - unsigned int i, l, r, m; |
| 164 | | - uint64_t seq; |
| | 189 | + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); |
| | 190 | + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); |
| 165 | 191 | |
| 166 | | - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); |
| 167 | | - pr_debug("%u journal buckets", ca->sb.njournal_buckets); |
| 168 | | - |
| | 192 | + /* |
| | 193 | + * Read journal buckets ordered by golden ratio hash to quickly |
| | 194 | + * find a sequence of buckets with valid journal entries |
| | 195 | + */ |
| | 196 | + for (i = 0; i < ca->sb.njournal_buckets; i++) { |
| 169 | 197 | /* |
| 170 | | - * Read journal buckets ordered by golden ratio hash to quickly |
| 171 | | - * find a sequence of buckets with valid journal entries |
| | 198 | + * We must try the index l with ZERO first for |
| | 199 | + * correctness due to the scenario that the journal |
| | 200 | + * bucket is circular buffer which might have wrapped |
| 172 | 201 | */ |
| 173 | | - for (i = 0; i < ca->sb.njournal_buckets; i++) { |
| 174 | | - /* |
| 175 | | - * We must try the index l with ZERO first for |
| 176 | | - * correctness due to the scenario that the journal |
| 177 | | - * bucket is circular buffer which might have wrapped |
| 178 | | - */ |
| 179 | | - l = (i * 2654435769U) % ca->sb.njournal_buckets; |
| | 202 | + l = (i * 2654435769U) % ca->sb.njournal_buckets; |
| 180 | 203 | |
| 181 | | - if (test_bit(l, bitmap)) |
| 182 | | - break; |
| | 204 | + if (test_bit(l, bitmap)) |
| | 205 | + break; |
| 183 | 206 | |
| 184 | | - if (read_bucket(l)) |
| 185 | | - goto bsearch; |
| 186 | | - } |
| 187 | | - |
| 188 | | - /* |
| 189 | | - * If that fails, check all the buckets we haven't checked |
| 190 | | - * already |
| 191 | | - */ |
| 192 | | - pr_debug("falling back to linear search"); |
| 193 | | - |
| 194 | | - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); |
| 195 | | - l < ca->sb.njournal_buckets; |
| 196 | | - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, |
| 197 | | - l + 1)) |
| 198 | | - if (read_bucket(l)) |
| 199 | | - goto bsearch; |
| 200 | | - |
| 201 | | - /* no journal entries on this device? */ |
| 202 | | - if (l == ca->sb.njournal_buckets) |
| 203 | | - continue; |
| 204 | | -bsearch: |
| 205 | | - BUG_ON(list_empty(list)); |
| 206 | | - |
| 207 | | - /* Binary search */ |
| 208 | | - m = l; |
| 209 | | - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); |
| 210 | | - pr_debug("starting binary search, l %u r %u", l, r); |
| 211 | | - |
| 212 | | - while (l + 1 < r) { |
| 213 | | - seq = list_entry(list->prev, struct journal_replay, |
| 214 | | - list)->j.seq; |
| 215 | | - |
| 216 | | - m = (l + r) >> 1; |
| 217 | | - read_bucket(m); |
| 218 | | - |
| 219 | | - if (seq != list_entry(list->prev, struct journal_replay, |
| 220 | | - list)->j.seq) |
| 221 | | - l = m; |
| 222 | | - else |
| 223 | | - r = m; |
| 224 | | - } |
| 225 | | - |
| 226 | | - /* |
| 227 | | - * Read buckets in reverse order until we stop finding more |
| 228 | | - * journal entries |
| 229 | | - */ |
| 230 | | - pr_debug("finishing up: m %u njournal_buckets %u", |
| 231 | | - m, ca->sb.njournal_buckets); |
| 232 | | - l = m; |
| 233 | | - |
| 234 | | - while (1) { |
| 235 | | - if (!l--) |
| 236 | | - l = ca->sb.njournal_buckets - 1; |
| 237 | | - |
| 238 | | - if (l == m) |
| 239 | | - break; |
| 240 | | - |
| 241 | | - if (test_bit(l, bitmap)) |
| 242 | | - continue; |
| 243 | | - |
| 244 | | - if (!read_bucket(l)) |
| 245 | | - break; |
| 246 | | - } |
| 247 | | - |
| 248 | | - seq = 0; |
| 249 | | - |
| 250 | | - for (i = 0; i < ca->sb.njournal_buckets; i++) |
| 251 | | - if (ja->seq[i] > seq) { |
| 252 | | - seq = ja->seq[i]; |
| 253 | | - /* |
| 254 | | - * When journal_reclaim() goes to allocate for |
| 255 | | - * the first time, it'll use the bucket after |
| 256 | | - * ja->cur_idx |
| 257 | | - */ |
| 258 | | - ja->cur_idx = i; |
| 259 | | - ja->last_idx = ja->discard_idx = (i + 1) % |
| 260 | | - ca->sb.njournal_buckets; |
| 261 | | - |
| 262 | | - } |
| | 207 | + if (read_bucket(l)) |
| | 208 | + goto bsearch; |
| 263 | 209 | } |
| 264 | 210 | |
| | 211 | + /* |
| | 212 | + * If that fails, check all the buckets we haven't checked |
| | 213 | + * already |
| | 214 | + */ |
| | 215 | + pr_debug("falling back to linear search\n"); |
| | 216 | + |
| | 217 | + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) |
| | 218 | + if (read_bucket(l)) |
| | 219 | + goto bsearch; |
| | 220 | + |
| | 221 | + /* no journal entries on this device? */ |
| | 222 | + if (l == ca->sb.njournal_buckets) |
| | 223 | + goto out; |
| | 224 | +bsearch: |
| | 225 | + BUG_ON(list_empty(list)); |
| | 226 | + |
| | 227 | + /* Binary search */ |
| | 228 | + m = l; |
| | 229 | + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); |
| | 230 | + pr_debug("starting binary search, l %u r %u\n", l, r); |
| | 231 | + |
| | 232 | + while (l + 1 < r) { |
| | 233 | + seq = list_entry(list->prev, struct journal_replay, |
| | 234 | + list)->j.seq; |
| | 235 | + |
| | 236 | + m = (l + r) >> 1; |
| | 237 | + read_bucket(m); |
| | 238 | + |
| | 239 | + if (seq != list_entry(list->prev, struct journal_replay, |
| | 240 | + list)->j.seq) |
| | 241 | + l = m; |
| | 242 | + else |
| | 243 | + r = m; |
| | 244 | + } |
| | 245 | + |
| | 246 | + /* |
| | 247 | + * Read buckets in reverse order until we stop finding more |
| | 248 | + * journal entries |
| | 249 | + */ |
| | 250 | + pr_debug("finishing up: m %u njournal_buckets %u\n", |
| | 251 | + m, ca->sb.njournal_buckets); |
| | 252 | + l = m; |
| | 253 | + |
| | 254 | + while (1) { |
| | 255 | + if (!l--) |
| | 256 | + l = ca->sb.njournal_buckets - 1; |
| | 257 | + |
| | 258 | + if (l == m) |
| | 259 | + break; |
| | 260 | + |
| | 261 | + if (test_bit(l, bitmap)) |
| | 262 | + continue; |
| | 263 | + |
| | 264 | + if (!read_bucket(l)) |
| | 265 | + break; |
| | 266 | + } |
| | 267 | + |
| | 268 | + seq = 0; |
| | 269 | + |
| | 270 | + for (i = 0; i < ca->sb.njournal_buckets; i++) |
| | 271 | + if (ja->seq[i] > seq) { |
| | 272 | + seq = ja->seq[i]; |
| | 273 | + /* |
| | 274 | + * When journal_reclaim() goes to allocate for |
| | 275 | + * the first time, it'll use the bucket after |
| | 276 | + * ja->cur_idx |
| | 277 | + */ |
| | 278 | + ja->cur_idx = i; |
| | 279 | + ja->last_idx = ja->discard_idx = (i + 1) % |
| | 280 | + ca->sb.njournal_buckets; |
| | 281 | + |
| | 282 | + } |
| | 283 | + |
| | 284 | +out: |
| 265 | 285 | if (!list_empty(list)) |
| 266 | 286 | c->journal.seq = list_entry(list->prev, |
| 267 | 287 | struct journal_replay, |
| .. | .. |
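The probe order in the loop above is Fibonacci hashing: 2654435769 is roughly 2^32 divided by the golden ratio, so successive values of `i` land scattered around the bucket ring instead of walking it linearly, and a run of valid journal buckets is usually hit within a few probes. A standalone demonstration of the probe sequence (the bucket count here is an arbitrary example, not taken from the patch):

```c
#include <stdio.h>

int main(void)
{
	unsigned int njournal_buckets = 10;	/* example value only */
	unsigned int i;

	/* same expression as the hunk: multiply wraps mod 2^32,
	 * then reduce onto the ring of journal buckets */
	for (i = 0; i < njournal_buckets; i++)
		printf("probe %u -> bucket %u\n",
		       i, (i * 2654435769U) % njournal_buckets);
	return 0;
}
```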
| 317 | 337 | } |
| 318 | 338 | } |
| 319 | 339 | |
| 320 | | -bool is_discard_enabled(struct cache_set *s) |
| | 340 | +static bool is_discard_enabled(struct cache_set *s) |
| 321 | 341 | { |
| 322 | | - struct cache *ca; |
| 323 | | - unsigned int i; |
| | 342 | + struct cache *ca = s->cache; |
| 324 | 343 | |
| 325 | | - for_each_cache(ca, s, i) |
| 326 | | - if (ca->discard) |
| 327 | | - return true; |
| | 344 | + if (ca->discard) |
| | 345 | + return true; |
| 328 | 346 | |
| 329 | 347 | return false; |
| 330 | 348 | } |
| .. | .. |
| 344 | 362 | |
| 345 | 363 | if (n != i->j.seq) { |
| 346 | 364 | if (n == start && is_discard_enabled(s)) |
| 347 | | - pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)", |
| | 365 | + pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", |
| 348 | 366 | n, i->j.seq - 1, start, end); |
| 349 | 367 | else { |
| 350 | | - pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", |
| | 368 | + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", |
| 351 | 369 | n, i->j.seq - 1, start, end); |
| 352 | 370 | ret = -EIO; |
| 353 | 371 | goto err; |
| .. | .. |
| 377 | 395 | entries++; |
| 378 | 396 | } |
| 379 | 397 | |
| 380 | | - pr_info("journal replay done, %i keys in %i entries, seq %llu", |
| | 398 | + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", |
| 381 | 399 | keys, entries, end); |
| 382 | 400 | err: |
| 383 | 401 | while (!list_empty(list)) { |
| .. | .. |
| 389 | 407 | return ret; |
| 390 | 408 | } |
| 391 | 409 | |
| | 410 | +void bch_journal_space_reserve(struct journal *j) |
| | 411 | +{ |
| | 412 | + j->do_reserve = true; |
| | 413 | +} |
| | 414 | + |
| 392 | 415 | /* Journalling */ |
| 393 | 416 | |
| 394 | 417 | static void btree_flush_write(struct cache_set *c) |
| 395 | 418 | { |
| 396 | | - /* |
| 397 | | - * Try to find the btree node with that references the oldest journal |
| 398 | | - * entry, best is our current candidate and is locked if non NULL: |
| 399 | | - */ |
| 400 | | - struct btree *b, *best; |
| 401 | | - unsigned int i; |
| | 419 | + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; |
| | 420 | + unsigned int i, nr; |
| | 421 | + int ref_nr; |
| | 422 | + atomic_t *fifo_front_p, *now_fifo_front_p; |
| | 423 | + size_t mask; |
| 402 | 424 | |
| | 425 | + if (c->journal.btree_flushing) |
| | 426 | + return; |
| | 427 | + |
| | 428 | + spin_lock(&c->journal.flush_write_lock); |
| | 429 | + if (c->journal.btree_flushing) { |
| | 430 | + spin_unlock(&c->journal.flush_write_lock); |
| | 431 | + return; |
| | 432 | + } |
| | 433 | + c->journal.btree_flushing = true; |
| | 434 | + spin_unlock(&c->journal.flush_write_lock); |
| | 435 | + |
| | 436 | + /* get the oldest journal entry and check its refcount */ |
| | 437 | + spin_lock(&c->journal.lock); |
| | 438 | + fifo_front_p = &fifo_front(&c->journal.pin); |
| | 439 | + ref_nr = atomic_read(fifo_front_p); |
| | 440 | + if (ref_nr <= 0) { |
| | 441 | + /* |
| | 442 | + * do nothing if no btree node references |
| | 443 | + * the oldest journal entry |
| | 444 | + */ |
| | 445 | + spin_unlock(&c->journal.lock); |
| | 446 | + goto out; |
| | 447 | + } |
| | 448 | + spin_unlock(&c->journal.lock); |
| | 449 | + |
| | 450 | + mask = c->journal.pin.mask; |
| | 451 | + nr = 0; |
| 403 | 452 | atomic_long_inc(&c->flush_write); |
| 404 | | -retry: |
| 405 | | - best = NULL; |
| | 453 | + memset(btree_nodes, 0, sizeof(btree_nodes)); |
| 406 | 454 | |
| 407 | 455 | mutex_lock(&c->bucket_lock); |
| 408 | | - for_each_cached_btree(b, c, i) |
| 409 | | - if (btree_current_write(b)->journal) { |
| 410 | | - if (!best) |
| 411 | | - best = b; |
| 412 | | - else if (journal_pin_cmp(c, |
| 413 | | - btree_current_write(best)->journal, |
| 414 | | - btree_current_write(b)->journal)) { |
| 415 | | - best = b; |
| 416 | | - } |
| | 456 | + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { |
| | 457 | + /* |
| | 458 | + * It is safe to get now_fifo_front_p without holding |
| | 459 | + * c->journal.lock here, because we don't need the |
| | 460 | + * exact value, just to check whether the front |
| | 461 | + * pointer of c->journal.pin has changed. |
| | 462 | + */ |
| | 463 | + now_fifo_front_p = &fifo_front(&c->journal.pin); |
| | 464 | + /* |
| | 465 | + * If the oldest journal entry is reclaimed and the |
| | 466 | + * front pointer of c->journal.pin changes, it is |
| | 467 | + * unnecessary to scan c->btree_cache anymore; just |
| | 468 | + * quit the loop and flush out what we already have. |
| | 469 | + */ |
| | 470 | + if (now_fifo_front_p != fifo_front_p) |
| | 471 | + break; |
| | 472 | + /* |
| | 473 | + * Quit this loop if all matching btree nodes are |
| | 474 | + * scanned and recorded in btree_nodes[] already. |
| | 475 | + */ |
| | 476 | + ref_nr = atomic_read(fifo_front_p); |
| | 477 | + if (nr >= ref_nr) |
| | 478 | + break; |
| | 479 | + |
| | 480 | + if (btree_node_journal_flush(b)) |
| | 481 | + pr_err("BUG: flush_write bit should not be set here!\n"); |
| | 482 | + |
| | 483 | + mutex_lock(&b->write_lock); |
| | 484 | + |
| | 485 | + if (!btree_node_dirty(b)) { |
| | 486 | + mutex_unlock(&b->write_lock); |
| | 487 | + continue; |
| 417 | 488 | } |
| 418 | 489 | |
| 419 | | - b = best; |
| 420 | | - if (b) |
| | 490 | + if (!btree_current_write(b)->journal) { |
| | 491 | + mutex_unlock(&b->write_lock); |
| | 492 | + continue; |
| | 493 | + } |
| | 494 | + |
| | 495 | + /* |
| | 496 | + * Only select the btree node which exactly references |
| | 497 | + * the oldest journal entry. |
| | 498 | + * |
| | 499 | + * If the journal entry pointed to by fifo_front_p is |
| | 500 | + * reclaimed in parallel, don't worry: |
| | 501 | + * - the list_for_each_xxx loop will quit when checking |
| | 502 | + * the next now_fifo_front_p. |
| | 503 | + * - If there are matched nodes recorded in btree_nodes[], |
| | 504 | + * they are clean now (this is why and how the oldest |
| | 505 | + * journal entry can be reclaimed). These selected nodes |
| | 506 | + * will be ignored and skipped in the following for-loop. |
| | 507 | + */ |
| | 508 | + if (((btree_current_write(b)->journal - fifo_front_p) & |
| | 509 | + mask) != 0) { |
| | 510 | + mutex_unlock(&b->write_lock); |
| | 511 | + continue; |
| | 512 | + } |
| | 513 | + |
| 421 | 514 | set_btree_node_journal_flush(b); |
| | 515 | + |
| | 516 | + mutex_unlock(&b->write_lock); |
| | 517 | + |
| | 518 | + btree_nodes[nr++] = b; |
| | 519 | + /* |
| | 520 | + * To avoid holding c->bucket_lock for too long, scan |
| | 521 | + * for at most BTREE_FLUSH_NR matched btree nodes. If |
| | 522 | + * more btree nodes reference the oldest journal entry, |
| | 523 | + * try to flush them the next time btree_flush_write() |
| | 524 | + * is called. |
| | 525 | + */ |
| | 526 | + if (nr == BTREE_FLUSH_NR) |
| | 527 | + break; |
| | 528 | + } |
| 422 | 529 | mutex_unlock(&c->bucket_lock); |
| 423 | 530 | |
| 424 | | - if (b) { |
| | 531 | + for (i = 0; i < nr; i++) { |
| | 532 | + b = btree_nodes[i]; |
| | 533 | + if (!b) { |
| | 534 | + pr_err("BUG: btree_nodes[%d] is NULL\n", i); |
| | 535 | + continue; |
| | 536 | + } |
| | 537 | + |
| | 538 | + /* safe to check without holding b->write_lock */ |
| | 539 | + if (!btree_node_journal_flush(b)) { |
| | 540 | + pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); |
| | 541 | + continue; |
| | 542 | + } |
| | 543 | + |
| 425 | 544 | mutex_lock(&b->write_lock); |
| 426 | 545 | if (!btree_current_write(b)->journal) { |
| 427 | 546 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
| 428 | 547 | mutex_unlock(&b->write_lock); |
| 429 | | - /* We raced */ |
| 430 | | - atomic_long_inc(&c->retry_flush_write); |
| 431 | | - goto retry; |
| | 548 | + pr_debug("bnode %p: written by others\n", b); |
| | 549 | + continue; |
| | 550 | + } |
| | 551 | + |
| | 552 | + if (!btree_node_dirty(b)) { |
| | 553 | + clear_bit(BTREE_NODE_journal_flush, &b->flags); |
| | 554 | + mutex_unlock(&b->write_lock); |
| | 555 | + pr_debug("bnode %p: dirty bit cleaned by others\n", b); |
| | 556 | + continue; |
| 432 | 557 | } |
| 433 | 558 | |
| 434 | 559 | __bch_btree_node_write(b, NULL); |
| 435 | 560 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
| 436 | 561 | mutex_unlock(&b->write_lock); |
| 437 | 562 | } |
| | 563 | + |
| | 564 | +out: |
| | 565 | + spin_lock(&c->journal.flush_write_lock); |
| | 566 | + c->journal.btree_flushing = false; |
| | 567 | + spin_unlock(&c->journal.flush_write_lock); |
| 438 | 568 | } |
| 439 | 569 | |
| 440 | 570 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) |
| .. | .. |
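The `(btree_current_write(b)->journal - fifo_front_p) & mask` test above relies on `c->journal.pin` being a power-of-two ring of `atomic_t`: a btree node references the oldest journal entry exactly when the offset between its pin slot and the FIFO front slot is zero modulo the ring size. A standalone sketch of that pointer arithmetic (illustrative values, not the kernel code):

```c
#include <stdio.h>

#define RING_SIZE 8			/* power of two, like c->journal.pin */

int main(void)
{
	int ring[RING_SIZE];
	size_t mask = RING_SIZE - 1;	/* the fifo's index mask */
	int *front = &ring[6];		/* FIFO front: the oldest entry */
	int *pin_a = &ring[6];		/* a node pinning the oldest entry */
	int *pin_b = &ring[2];		/* a node pinning a newer entry */

	/* the pointer difference can be negative; masking folds it back
	 * onto the ring, so only an exact match yields zero */
	printf("a: %zu\n", (size_t)(pin_a - front) & mask);	/* 0: oldest */
	printf("b: %zu\n", (size_t)(pin_b - front) & mask);	/* 4: skip  */
	return 0;
}
```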
| 478 | 608 | ca->sb.njournal_buckets; |
| 479 | 609 | |
| 480 | 610 | atomic_set(&ja->discard_in_flight, DISCARD_READY); |
| 481 | | - /* fallthrough */ |
| | 611 | + fallthrough; |
| 482 | 612 | |
| 483 | 613 | case DISCARD_READY: |
| 484 | 614 | if (ja->discard_idx == ja->last_idx) |
| .. | .. |
| 500 | 630 | } |
| 501 | 631 | } |
| 502 | 632 | |
| | 633 | +static unsigned int free_journal_buckets(struct cache_set *c) |
| | 634 | +{ |
| | 635 | + struct journal *j = &c->journal; |
| | 636 | + struct cache *ca = c->cache; |
| | 637 | + struct journal_device *ja = &c->cache->journal; |
| | 638 | + unsigned int n; |
| | 639 | + |
| | 640 | + /* In case njournal_buckets is not a power of 2 */ |
| | 641 | + if (ja->cur_idx >= ja->discard_idx) |
| | 642 | + n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; |
| | 643 | + else |
| | 644 | + n = ja->discard_idx - ja->cur_idx; |
| | 645 | + |
| | 646 | + if (n > (1 + j->do_reserve)) |
| | 647 | + return n - (1 + j->do_reserve); |
| | 648 | + |
| | 649 | + return 0; |
| | 650 | +} |
| | 651 | + |
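The arithmetic in `free_journal_buckets()` counts the buckets between the write index and the discard index on the ring, handling wraparound, then withholds one bucket, plus a second once `do_reserve` has been set by `bch_journal_space_reserve()`. A standalone sketch with example numbers (the names mirror the patch, but this is not the kernel code):

```c
#include <stdbool.h>
#include <stdio.h>

static unsigned int free_journal_buckets(unsigned int njournal_buckets,
					 unsigned int cur_idx,
					 unsigned int discard_idx,
					 bool do_reserve)
{
	unsigned int n;

	if (cur_idx >= discard_idx)		/* the ring has wrapped */
		n = njournal_buckets + discard_idx - cur_idx;
	else
		n = discard_idx - cur_idx;

	/* hold back 1 bucket, plus 1 more once the reserve is enabled */
	if (n > (1u + do_reserve))
		return n - (1u + do_reserve);
	return 0;
}

int main(void)
{
	/* 8 buckets, writer at 6, discard at 2: 4 on the ring,
	 * 2 usable with the reserve enabled (8 + 2 - 6 - 2 = 2) */
	printf("%u\n", free_journal_buckets(8, 6, 2, true));
	return 0;
}
```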
| 503 | 652 | static void journal_reclaim(struct cache_set *c) |
| 504 | 653 | { |
| 505 | 654 | struct bkey *k = &c->journal.key; |
| 506 | | - struct cache *ca; |
| | 655 | + struct cache *ca = c->cache; |
| 507 | 656 | uint64_t last_seq; |
| 508 | | - unsigned int iter, n = 0; |
| | 657 | + struct journal_device *ja = &ca->journal; |
| 509 | 658 | atomic_t p __maybe_unused; |
| 510 | 659 | |
| 511 | 660 | atomic_long_inc(&c->reclaim); |
| .. | .. |
| 517 | 666 | |
| 518 | 667 | /* Update last_idx */ |
| 519 | 668 | |
| 520 | | - for_each_cache(ca, c, iter) { |
| 521 | | - struct journal_device *ja = &ca->journal; |
| | 669 | + while (ja->last_idx != ja->cur_idx && |
| | 670 | + ja->seq[ja->last_idx] < last_seq) |
| | 671 | + ja->last_idx = (ja->last_idx + 1) % |
| | 672 | + ca->sb.njournal_buckets; |
| 522 | 673 | |
| 523 | | - while (ja->last_idx != ja->cur_idx && |
| 524 | | - ja->seq[ja->last_idx] < last_seq) |
| 525 | | - ja->last_idx = (ja->last_idx + 1) % |
| 526 | | - ca->sb.njournal_buckets; |
| 527 | | - } |
| 528 | | - |
| 529 | | - for_each_cache(ca, c, iter) |
| 530 | | - do_journal_discard(ca); |
| | 674 | + do_journal_discard(ca); |
| 531 | 675 | |
| 532 | 676 | if (c->journal.blocks_free) |
| 533 | 677 | goto out; |
| 534 | 678 | |
| 535 | | - /* |
| 536 | | - * Allocate: |
| 537 | | - * XXX: Sort by free journal space |
| 538 | | - */ |
| | 679 | + if (!free_journal_buckets(c)) |
| | 680 | + goto out; |
| 539 | 681 | |
| 540 | | - for_each_cache(ca, c, iter) { |
| 541 | | - struct journal_device *ja = &ca->journal; |
| 542 | | - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; |
| | 682 | + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; |
| | 683 | + k->ptr[0] = MAKE_PTR(0, |
| | 684 | + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), |
| | 685 | + ca->sb.nr_this_dev); |
| | 686 | + atomic_long_inc(&c->reclaimed_journal_buckets); |
| 543 | 687 | |
| 544 | | - /* No space available on this device */ |
| 545 | | - if (next == ja->discard_idx) |
| 546 | | - continue; |
| | 688 | + bkey_init(k); |
| | 689 | + SET_KEY_PTRS(k, 1); |
| | 690 | + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; |
| 547 | 691 | |
| 548 | | - ja->cur_idx = next; |
| 549 | | - k->ptr[n++] = MAKE_PTR(0, |
| 550 | | - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), |
| 551 | | - ca->sb.nr_this_dev); |
| 552 | | - } |
| 553 | | - |
| 554 | | - if (n) { |
| 555 | | - bkey_init(k); |
| 556 | | - SET_KEY_PTRS(k, n); |
| 557 | | - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; |
| 558 | | - } |
| 559 | 692 | out: |
| 560 | 693 | if (!journal_full(&c->journal)) |
| 561 | 694 | __closure_wake_up(&c->journal.wait); |
| .. | .. |
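The reworked `journal_reclaim()` above advances `ja->last_idx` around the ring, releasing every bucket whose newest sequence number is already older than the oldest sequence the journal still pins. A standalone sketch of that "Update last_idx" step with example values (illustrative names, not the kernel code):

```c
#include <stdio.h>

int main(void)
{
	unsigned int njournal_buckets = 4;
	unsigned long long seq[4] = { 90, 100, 110, 120 }; /* newest seq per bucket */
	unsigned int last_idx = 0, cur_idx = 3;
	unsigned long long last_seq = 105;	/* oldest seq still pinned */

	/* release buckets that only hold entries older than last_seq */
	while (last_idx != cur_idx && seq[last_idx] < last_seq)
		last_idx = (last_idx + 1) % njournal_buckets;

	printf("last_idx advanced to %u\n", last_idx);	/* prints 2 */
	return 0;
}
```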
| 582 | 715 | j->cur->data->keys = 0; |
| 583 | 716 | |
| 584 | 717 | if (fifo_full(&j->pin)) |
| 585 | | - pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); |
| | 718 | + pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); |
| 586 | 719 | } |
| 587 | 720 | |
| 588 | 721 | static void journal_write_endio(struct bio *bio) |
| .. | .. |
| 619 | 752 | __releases(c->journal.lock) |
| 620 | 753 | { |
| 621 | 754 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); |
| 622 | | - struct cache *ca; |
| | 755 | + struct cache *ca = c->cache; |
| 623 | 756 | struct journal_write *w = c->journal.cur; |
| 624 | 757 | struct bkey *k = &c->journal.key; |
| 625 | | - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * |
| 626 | | - c->sb.block_size; |
| | 758 | + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * |
| | 759 | + ca->sb.block_size; |
| 627 | 760 | |
| 628 | 761 | struct bio *bio; |
| 629 | 762 | struct bio_list list; |
| .. | .. |
| 642 | 775 | return; |
| 643 | 776 | } |
| 644 | 777 | |
| 645 | | - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); |
| | 778 | + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); |
| 646 | 779 | |
| 647 | 780 | w->data->btree_level = c->root->level; |
| 648 | 781 | |
| 649 | 782 | bkey_copy(&w->data->btree_root, &c->root->key); |
| 650 | 783 | bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); |
| 651 | 784 | |
| 652 | | - for_each_cache(ca, c, i) |
| 653 | | - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
| 654 | | - |
| 655 | | - w->data->magic = jset_magic(&c->sb); |
| | 785 | + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
| | 786 | + w->data->magic = jset_magic(&ca->sb); |
| 656 | 787 | w->data->version = BCACHE_JSET_VERSION; |
| 657 | 788 | w->data->last_seq = last_seq(&c->journal); |
| 658 | 789 | w->data->csum = csum_set(w->data); |
| .. | .. |
| 674 | 805 | REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); |
| 675 | 806 | bch_bio_map(bio, w->data); |
| 676 | 807 | |
| 677 | | - trace_bcache_journal_write(bio); |
| | 808 | + trace_bcache_journal_write(bio, w->data->keys); |
| 678 | 809 | bio_list_add(&list, bio); |
| 679 | 810 | |
| 680 | 811 | SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); |
| .. | .. |
| 728 | 859 | size_t sectors; |
| 729 | 860 | struct closure cl; |
| 730 | 861 | bool wait = false; |
| | 862 | + struct cache *ca = c->cache; |
| 731 | 863 | |
| 732 | 864 | closure_init_stack(&cl); |
| 733 | 865 | |
| .. | .. |
| 737 | 869 | struct journal_write *w = c->journal.cur; |
| 738 | 870 | |
| 739 | 871 | sectors = __set_blocks(w->data, w->data->keys + nkeys, |
| 740 | | - block_bytes(c)) * c->sb.block_size; |
| | 872 | + block_bytes(ca)) * ca->sb.block_size; |
| 741 | 873 | |
| 742 | 874 | if (sectors <= min_t(size_t, |
| 743 | | - c->journal.blocks_free * c->sb.block_size, |
| | 875 | + c->journal.blocks_free * ca->sb.block_size, |
| 744 | 876 | PAGE_SECTORS << JSET_BITS)) |
| 745 | 877 | return w; |
| 746 | 878 | |
| .. | .. |
| 805 | 937 | if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) |
| 806 | 938 | return NULL; |
| 807 | 939 | |
| 808 | | - if (!CACHE_SYNC(&c->sb)) |
| | 940 | + if (!CACHE_SYNC(&c->cache->sb)) |
| 809 | 941 | return NULL; |
| 810 | 942 | |
| 811 | 943 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
| .. | .. |
| 821 | 953 | journal_try_write(c); |
| 822 | 954 | } else if (!w->dirty) { |
| 823 | 955 | w->dirty = true; |
| 824 | | - schedule_delayed_work(&c->journal.work, |
| 825 | | - msecs_to_jiffies(c->journal_delay_ms)); |
| | 956 | + queue_delayed_work(bch_flush_wq, &c->journal.work, |
| | 957 | + msecs_to_jiffies(c->journal_delay_ms)); |
| 826 | 958 | spin_unlock(&c->journal.lock); |
| 827 | 959 | } else { |
| 828 | 960 | spin_unlock(&c->journal.lock); |
| .. | .. |
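The `queue_delayed_work()` change above assumes a dedicated `bch_flush_wq` workqueue created elsewhere (in the upstream tree this setup lives in super.c), so that delayed journal flushes are not queued behind unrelated work on the system workqueue. A sketch of what that setup plausibly looks like; the flags here are an assumption, not quoted from the patch:

```c
#include <linux/errno.h>
#include <linux/workqueue.h>

struct workqueue_struct *bch_flush_wq;

/* called once from the driver's module init path (assumed placement) */
static int bch_flush_wq_create(void)
{
	/* WQ_MEM_RECLAIM is assumed here: journal flushes may be
	 * required to make forward progress under memory pressure */
	bch_flush_wq = alloc_workqueue("bch_flush", WQ_MEM_RECLAIM, 0);
	return bch_flush_wq ? 0 : -ENOMEM;
}
```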
| 856 | 988 | struct journal *j = &c->journal; |
| 857 | 989 | |
| 858 | 990 | spin_lock_init(&j->lock); |
| | 991 | + spin_lock_init(&j->flush_write_lock); |
| 859 | 992 | INIT_DELAYED_WORK(&j->work, journal_write_work); |
| 860 | 993 | |
| 861 | 994 | c->journal_delay_ms = 100; |
|---|