.. | .. |
---|
20 | 20 | |
---|
21 | 21 | #define HIGH_WATERMARK 50 |
---|
22 | 22 | #define LOW_WATERMARK 45 |
---|
23 | | -#define MAX_WRITEBACK_JOBS 0 |
---|
| 23 | +#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) |
---|
24 | 24 | #define ENDIO_LATENCY 16 |
---|
25 | 25 | #define WRITEBACK_LATENCY 64 |
---|
26 | 26 | #define AUTOCOMMIT_BLOCKS_SSD 65536 |
---|
27 | 27 | #define AUTOCOMMIT_BLOCKS_PMEM 64 |
---|
28 | 28 | #define AUTOCOMMIT_MSEC 1000 |
---|
| 29 | +#define MAX_AGE_DIV 16 |
---|
| 30 | +#define MAX_AGE_UNSPECIFIED -1UL |
---|
29 | 31 | |
---|
30 | 32 | #define BITMAP_GRANULARITY 65536 |
---|
31 | 33 | #if BITMAP_GRANULARITY < PAGE_SIZE |
---|
.. | .. |
---|
47 | 49 | #define pmem_assign(dest, src) ((dest) = (src)) |
---|
48 | 50 | #endif |
---|
49 | 51 | |
---|
50 | | -#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM) |
---|
| 52 | +#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) |
---|
51 | 53 | #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS |
---|
52 | 54 | #endif |
---|
53 | 55 | |
---|
.. | .. |
---|
88 | 90 | :47 |
---|
89 | 91 | #endif |
---|
90 | 92 | ; |
---|
| 93 | + unsigned long age; |
---|
91 | 94 | #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS |
---|
92 | 95 | uint64_t original_sector; |
---|
93 | 96 | uint64_t seq_count; |
---|
.. | .. |
---|
119 | 122 | size_t writeback_size; |
---|
120 | 123 | size_t freelist_high_watermark; |
---|
121 | 124 | size_t freelist_low_watermark; |
---|
| 125 | + unsigned long max_age; |
---|
122 | 126 | |
---|
123 | 127 | unsigned uncommitted_blocks; |
---|
124 | 128 | unsigned autocommit_blocks; |
---|
.. | .. |
---|
129 | 133 | unsigned long autocommit_jiffies; |
---|
130 | 134 | struct timer_list autocommit_timer; |
---|
131 | 135 | struct wait_queue_head freelist_wait; |
---|
| 136 | + |
---|
| 137 | + struct timer_list max_age_timer; |
---|
132 | 138 | |
---|
133 | 139 | atomic_t bio_in_progress[2]; |
---|
134 | 140 | struct wait_queue_head bio_in_progress_wait[2]; |
---|
.. | .. |
---|
160 | 166 | bool max_writeback_jobs_set:1; |
---|
161 | 167 | bool autocommit_blocks_set:1; |
---|
162 | 168 | bool autocommit_time_set:1; |
---|
| 169 | + bool max_age_set:1; |
---|
163 | 170 | bool writeback_fua_set:1; |
---|
164 | 171 | bool flush_on_suspend:1; |
---|
| 172 | + bool cleaner:1; |
---|
| 173 | + bool cleaner_set:1; |
---|
165 | 174 | |
---|
166 | 175 | unsigned high_wm_percent_value; |
---|
167 | 176 | unsigned low_wm_percent_value; |
---|
168 | 177 | unsigned autocommit_time_value; |
---|
| 178 | + unsigned max_age_value; |
---|
169 | 179 | |
---|
170 | 180 | unsigned writeback_all; |
---|
171 | 181 | struct workqueue_struct *writeback_wq; |
---|
.. | .. |
---|
196 | 206 | struct dm_writecache *wc; |
---|
197 | 207 | struct wc_entry **wc_list; |
---|
198 | 208 | unsigned wc_list_n; |
---|
199 | | - unsigned page_offset; |
---|
200 | | - struct page *page; |
---|
201 | 209 | struct wc_entry *wc_list_inline[WB_LIST_INLINE]; |
---|
202 | 210 | struct bio bio; |
---|
203 | 211 | }; |
---|
.. | .. |
---|
236 | 244 | |
---|
237 | 245 | wc->memory_vmapped = false; |
---|
238 | 246 | |
---|
239 | | - if (!wc->ssd_dev->dax_dev) { |
---|
240 | | - r = -EOPNOTSUPP; |
---|
241 | | - goto err1; |
---|
242 | | - } |
---|
243 | 247 | s = wc->memory_map_size; |
---|
244 | 248 | p = s >> PAGE_SHIFT; |
---|
245 | 249 | if (!p) { |
---|
.. | .. |
---|
366 | 370 | |
---|
367 | 371 | static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) |
---|
368 | 372 | { |
---|
369 | | - if (is_power_of_2(sizeof(struct wc_entry)) && 0) |
---|
370 | | - return &sb(wc)->entries[e - wc->entries]; |
---|
371 | | - else |
---|
372 | | - return &sb(wc)->entries[e->index]; |
---|
| 373 | + return &sb(wc)->entries[e->index]; |
---|
373 | 374 | } |
---|
374 | 375 | |
---|
375 | 376 | static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) |
---|
.. | .. |
---|
523 | 524 | memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); |
---|
524 | 525 | } |
---|
525 | 526 | |
---|
| 527 | +static void ssd_commit_superblock(struct dm_writecache *wc) |
---|
| 528 | +{ |
---|
| 529 | + int r; |
---|
| 530 | + struct dm_io_region region; |
---|
| 531 | + struct dm_io_request req; |
---|
| 532 | + |
---|
| 533 | + region.bdev = wc->ssd_dev->bdev; |
---|
| 534 | + region.sector = 0; |
---|
| 535 | + region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; |
---|
| 536 | + |
---|
| 537 | + if (unlikely(region.sector + region.count > wc->metadata_sectors)) |
---|
| 538 | + region.count = wc->metadata_sectors - region.sector; |
---|
| 539 | + |
---|
| 540 | + region.sector += wc->start_sector; |
---|
| 541 | + |
---|
| 542 | + req.bi_op = REQ_OP_WRITE; |
---|
| 543 | + req.bi_op_flags = REQ_SYNC | REQ_FUA; |
---|
| 544 | + req.mem.type = DM_IO_VMA; |
---|
| 545 | + req.mem.ptr.vma = (char *)wc->memory_map; |
---|
| 546 | + req.client = wc->dm_io; |
---|
| 547 | + req.notify.fn = NULL; |
---|
| 548 | + req.notify.context = NULL; |
---|
| 549 | + |
---|
| 550 | + r = dm_io(&req, 1, &region, NULL); |
---|
| 551 | + if (unlikely(r)) |
---|
| 552 | + writecache_error(wc, r, "error writing superblock"); |
---|
| 553 | +} |
---|
| 554 | + |
---|
526 | 555 | static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) |
---|
527 | 556 | { |
---|
528 | 557 | if (WC_MODE_PMEM(wc)) |
---|
529 | | - wmb(); |
---|
| 558 | + pmem_wmb(); |
---|
530 | 559 | else |
---|
531 | 560 | ssd_commit_flushed(wc, wait_for_ios); |
---|
532 | 561 | } |
---|
.. | .. |
---|
568 | 597 | e = container_of(node, struct wc_entry, rb_node); |
---|
569 | 598 | if (read_original_sector(wc, e) == block) |
---|
570 | 599 | break; |
---|
| 600 | + |
---|
571 | 601 | node = (read_original_sector(wc, e) >= block ? |
---|
572 | 602 | e->rb_node.rb_left : e->rb_node.rb_right); |
---|
573 | 603 | if (unlikely(!node)) { |
---|
574 | | - if (!(flags & WFE_RETURN_FOLLOWING)) { |
---|
| 604 | + if (!(flags & WFE_RETURN_FOLLOWING)) |
---|
575 | 605 | return NULL; |
---|
576 | | - } |
---|
577 | 606 | if (read_original_sector(wc, e) >= block) { |
---|
578 | | - break; |
---|
| 607 | + return e; |
---|
579 | 608 | } else { |
---|
580 | 609 | node = rb_next(&e->rb_node); |
---|
581 | | - if (unlikely(!node)) { |
---|
| 610 | + if (unlikely(!node)) |
---|
582 | 611 | return NULL; |
---|
583 | | - } |
---|
584 | 612 | e = container_of(node, struct wc_entry, rb_node); |
---|
585 | | - break; |
---|
| 613 | + return e; |
---|
586 | 614 | } |
---|
587 | 615 | } |
---|
588 | 616 | } |
---|
.. | .. |
---|
593 | 621 | node = rb_prev(&e->rb_node); |
---|
594 | 622 | else |
---|
595 | 623 | node = rb_next(&e->rb_node); |
---|
596 | | - if (!node) |
---|
| 624 | + if (unlikely(!node)) |
---|
597 | 625 | return e; |
---|
598 | 626 | e2 = container_of(node, struct wc_entry, rb_node); |
---|
599 | 627 | if (read_original_sector(wc, e2) != block) |
---|
.. | .. |
---|
618 | 646 | rb_link_node(&ins->rb_node, parent, node); |
---|
619 | 647 | rb_insert_color(&ins->rb_node, &wc->tree); |
---|
620 | 648 | list_add(&ins->lru, &wc->lru); |
---|
| 649 | + ins->age = jiffies; |
---|
621 | 650 | } |
---|
622 | 651 | |
---|
623 | 652 | static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) |
---|
.. | .. |
---|
653 | 682 | queue_work(wc->writeback_wq, &wc->writeback_work); |
---|
654 | 683 | } |
---|
655 | 684 | |
---|
656 | | -static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) |
---|
| 685 | +static void writecache_max_age_timer(struct timer_list *t) |
---|
| 686 | +{ |
---|
| 687 | + struct dm_writecache *wc = from_timer(wc, t, max_age_timer); |
---|
| 688 | + |
---|
| 689 | + if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { |
---|
| 690 | + queue_work(wc->writeback_wq, &wc->writeback_work); |
---|
| 691 | + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); |
---|
| 692 | + } |
---|
| 693 | +} |
---|
| 694 | + |
---|
| 695 | +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) |
---|
657 | 696 | { |
---|
658 | 697 | struct wc_entry *e; |
---|
659 | 698 | |
---|
.. | .. |
---|
662 | 701 | if (unlikely(!wc->current_free)) |
---|
663 | 702 | return NULL; |
---|
664 | 703 | e = wc->current_free; |
---|
| 704 | + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) |
---|
| 705 | + return NULL; |
---|
665 | 706 | next = rb_next(&e->rb_node); |
---|
666 | 707 | rb_erase(&e->rb_node, &wc->freetree); |
---|
667 | 708 | if (unlikely(!next)) |
---|
.. | .. |
---|
671 | 712 | if (unlikely(list_empty(&wc->freelist))) |
---|
672 | 713 | return NULL; |
---|
673 | 714 | e = container_of(wc->freelist.next, struct wc_entry, lru); |
---|
| 715 | + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) |
---|
| 716 | + return NULL; |
---|
674 | 717 | list_del(&e->lru); |
---|
675 | 718 | } |
---|
676 | 719 | wc->freelist_size--; |
---|
.. | .. |
---|
759 | 802 | |
---|
760 | 803 | wc->seq_count++; |
---|
761 | 804 | pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); |
---|
762 | | - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); |
---|
763 | | - writecache_commit_flushed(wc, false); |
---|
| 805 | + if (WC_MODE_PMEM(wc)) |
---|
| 806 | + writecache_commit_flushed(wc, false); |
---|
| 807 | + else |
---|
| 808 | + ssd_commit_superblock(wc); |
---|
764 | 809 | |
---|
765 | 810 | wc->overwrote_committed = false; |
---|
766 | 811 | |
---|
.. | .. |
---|
823 | 868 | |
---|
824 | 869 | if (likely(!e->write_in_progress)) { |
---|
825 | 870 | if (!discarded_something) { |
---|
826 | | - writecache_wait_for_ios(wc, READ); |
---|
827 | | - writecache_wait_for_ios(wc, WRITE); |
---|
| 871 | + if (!WC_MODE_PMEM(wc)) { |
---|
| 872 | + writecache_wait_for_ios(wc, READ); |
---|
| 873 | + writecache_wait_for_ios(wc, WRITE); |
---|
| 874 | + } |
---|
828 | 875 | discarded_something = true; |
---|
829 | 876 | } |
---|
830 | 877 | if (!writecache_entry_is_committed(wc, e)) |
---|
.. | .. |
---|
832 | 879 | writecache_free_entry(wc, e); |
---|
833 | 880 | } |
---|
834 | 881 | |
---|
835 | | - if (!node) |
---|
| 882 | + if (unlikely(!node)) |
---|
836 | 883 | break; |
---|
837 | 884 | |
---|
838 | 885 | e = container_of(node, struct wc_entry, rb_node); |
---|
.. | .. |
---|
857 | 904 | bool flush_on_suspend; |
---|
858 | 905 | |
---|
859 | 906 | del_timer_sync(&wc->autocommit_timer); |
---|
| 907 | + del_timer_sync(&wc->max_age_timer); |
---|
860 | 908 | |
---|
861 | 909 | wc_lock(wc); |
---|
862 | 910 | writecache_flush(wc); |
---|
.. | .. |
---|
955 | 1003 | } |
---|
956 | 1004 | wc->freelist_size = 0; |
---|
957 | 1005 | |
---|
958 | | - r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t)); |
---|
| 1006 | + r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, |
---|
| 1007 | + sizeof(uint64_t)); |
---|
959 | 1008 | if (r) { |
---|
960 | 1009 | writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); |
---|
961 | 1010 | sb_seq_count = cpu_to_le64(0); |
---|
.. | .. |
---|
971 | 1020 | e->seq_count = -1; |
---|
972 | 1021 | continue; |
---|
973 | 1022 | } |
---|
974 | | - r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry)); |
---|
| 1023 | + r = copy_mc_to_kernel(&wme, memory_entry(wc, e), |
---|
| 1024 | + sizeof(struct wc_memory_entry)); |
---|
975 | 1025 | if (r) { |
---|
976 | 1026 | writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", |
---|
977 | 1027 | (unsigned long)b, r); |
---|
.. | .. |
---|
1025 | 1075 | |
---|
1026 | 1076 | writecache_verify_watermark(wc); |
---|
1027 | 1077 | |
---|
| 1078 | + if (wc->max_age != MAX_AGE_UNSPECIFIED) |
---|
| 1079 | + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); |
---|
| 1080 | + |
---|
1028 | 1081 | wc_unlock(wc); |
---|
1029 | 1082 | } |
---|
1030 | 1083 | |
---|
.. | .. |
---|
1073 | 1126 | return 0; |
---|
1074 | 1127 | } |
---|
1075 | 1128 | |
---|
| 1129 | +static void activate_cleaner(struct dm_writecache *wc) |
---|
| 1130 | +{ |
---|
| 1131 | + wc->flush_on_suspend = true; |
---|
| 1132 | + wc->cleaner = true; |
---|
| 1133 | + wc->freelist_high_watermark = wc->n_blocks; |
---|
| 1134 | + wc->freelist_low_watermark = wc->n_blocks; |
---|
| 1135 | +} |
---|
| 1136 | + |
---|
| 1137 | +static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) |
---|
| 1138 | +{ |
---|
| 1139 | + if (argc != 1) |
---|
| 1140 | + return -EINVAL; |
---|
| 1141 | + |
---|
| 1142 | + wc_lock(wc); |
---|
| 1143 | + activate_cleaner(wc); |
---|
| 1144 | + if (!dm_suspended(wc->ti)) |
---|
| 1145 | + writecache_verify_watermark(wc); |
---|
| 1146 | + wc_unlock(wc); |
---|
| 1147 | + |
---|
| 1148 | + return 0; |
---|
| 1149 | +} |
---|
| 1150 | + |
---|
1076 | 1151 | static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, |
---|
1077 | 1152 | char *result, unsigned maxlen) |
---|
1078 | 1153 | { |
---|
.. | .. |
---|
1083 | 1158 | r = process_flush_mesg(argc, argv, wc); |
---|
1084 | 1159 | else if (!strcasecmp(argv[0], "flush_on_suspend")) |
---|
1085 | 1160 | r = process_flush_on_suspend_mesg(argc, argv, wc); |
---|
| 1161 | + else if (!strcasecmp(argv[0], "cleaner")) |
---|
| 1162 | + r = process_cleaner_mesg(argc, argv, wc); |
---|
1086 | 1163 | else |
---|
1087 | 1164 | DMERR("unrecognised message received: %s", argv[0]); |
---|
1088 | 1165 | |
---|
1089 | 1166 | return r; |
---|
| 1167 | +} |
---|
| 1168 | + |
---|
| 1169 | +static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) |
---|
| 1170 | +{ |
---|
| 1171 | + /* |
---|
| 1172 | + * clflushopt performs better with block size 1024, 2048, 4096 |
---|
| 1173 | + * non-temporal stores perform better with block size 512 |
---|
| 1174 | + * |
---|
| 1175 | + * block size 512 1024 2048 4096 |
---|
| 1176 | + * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s |
---|
| 1177 | + * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s |
---|
| 1178 | + * |
---|
| 1179 | + * We see that movnti performs better for 512-byte blocks, and |
---|
| 1180 | + * clflushopt performs better for 1024-byte and larger blocks. So, we |
---|
| 1181 | + * prefer clflushopt for sizes >= 768. |
---|
| 1182 | + * |
---|
| 1183 | + * NOTE: this happens to be the case now (with dm-writecache's single |
---|
| 1184 | + * threaded model) but re-evaluate this once memcpy_flushcache() is |
---|
| 1185 | + * enabled to use movdir64b which might invalidate this performance |
---|
| 1186 | + * advantage seen with cache-allocating-writes plus flushing. |
---|
| 1187 | + */ |
---|
| 1188 | +#ifdef CONFIG_X86 |
---|
| 1189 | + if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && |
---|
| 1190 | + likely(boot_cpu_data.x86_clflush_size == 64) && |
---|
| 1191 | + likely(size >= 768)) { |
---|
| 1192 | + do { |
---|
| 1193 | + memcpy((void *)dest, (void *)source, 64); |
---|
| 1194 | + clflushopt((void *)dest); |
---|
| 1195 | + dest += 64; |
---|
| 1196 | + source += 64; |
---|
| 1197 | + size -= 64; |
---|
| 1198 | + } while (size >= 64); |
---|
| 1199 | + return; |
---|
| 1200 | + } |
---|
| 1201 | +#endif |
---|
| 1202 | + memcpy_flushcache(dest, source, size); |
---|
1090 | 1203 | } |
---|
1091 | 1204 | |
---|
1092 | 1205 | static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) |
---|
.. | .. |
---|
1106 | 1219 | |
---|
1107 | 1220 | if (rw == READ) { |
---|
1108 | 1221 | int r; |
---|
1109 | | - r = memcpy_mcsafe(buf, data, size); |
---|
| 1222 | + r = copy_mc_to_kernel(buf, data, size); |
---|
1110 | 1223 | flush_dcache_page(bio_page(bio)); |
---|
1111 | 1224 | if (unlikely(r)) { |
---|
1112 | 1225 | writecache_error(wc, r, "hardware memory error when reading data: %d", r); |
---|
.. | .. |
---|
1114 | 1227 | } |
---|
1115 | 1228 | } else { |
---|
1116 | 1229 | flush_dcache_page(bio_page(bio)); |
---|
1117 | | - memcpy_flushcache(data, buf, size); |
---|
| 1230 | + memcpy_flushcache_optimized(data, buf, size); |
---|
1118 | 1231 | } |
---|
1119 | 1232 | |
---|
1120 | 1233 | bvec_kunmap_irq(buf, &flags); |
---|
.. | .. |
---|
1152 | 1265 | bio_end_sector(bio)); |
---|
1153 | 1266 | wc_unlock(wc); |
---|
1154 | 1267 | bio_set_dev(bio, wc->dev->bdev); |
---|
1155 | | - generic_make_request(bio); |
---|
| 1268 | + submit_bio_noacct(bio); |
---|
1156 | 1269 | } else { |
---|
1157 | 1270 | writecache_flush(wc); |
---|
1158 | 1271 | wc_unlock(wc); |
---|
.. | .. |
---|
1188 | 1301 | writecache_flush(wc); |
---|
1189 | 1302 | if (writecache_has_error(wc)) |
---|
1190 | 1303 | goto unlock_error; |
---|
| 1304 | + if (unlikely(wc->cleaner)) |
---|
| 1305 | + goto unlock_remap_origin; |
---|
1191 | 1306 | goto unlock_submit; |
---|
1192 | 1307 | } else { |
---|
| 1308 | + if (dm_bio_get_target_bio_nr(bio)) |
---|
| 1309 | + goto unlock_remap_origin; |
---|
1193 | 1310 | writecache_offload_bio(wc, bio); |
---|
1194 | 1311 | goto unlock_return; |
---|
1195 | 1312 | } |
---|
.. | .. |
---|
1246 | 1363 | } |
---|
1247 | 1364 | } else { |
---|
1248 | 1365 | do { |
---|
| 1366 | + bool found_entry = false; |
---|
| 1367 | + bool search_used = false; |
---|
1249 | 1368 | if (writecache_has_error(wc)) |
---|
1250 | 1369 | goto unlock_error; |
---|
1251 | 1370 | e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); |
---|
1252 | 1371 | if (e) { |
---|
1253 | | - if (!writecache_entry_is_committed(wc, e)) |
---|
1254 | | - goto bio_copy; |
---|
1255 | | - if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { |
---|
1256 | | - wc->overwrote_committed = true; |
---|
| 1372 | + if (!writecache_entry_is_committed(wc, e)) { |
---|
| 1373 | + search_used = true; |
---|
1257 | 1374 | goto bio_copy; |
---|
1258 | 1375 | } |
---|
| 1376 | + if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { |
---|
| 1377 | + wc->overwrote_committed = true; |
---|
| 1378 | + search_used = true; |
---|
| 1379 | + goto bio_copy; |
---|
| 1380 | + } |
---|
| 1381 | + found_entry = true; |
---|
| 1382 | + } else { |
---|
| 1383 | + if (unlikely(wc->cleaner)) |
---|
| 1384 | + goto direct_write; |
---|
1259 | 1385 | } |
---|
1260 | | - e = writecache_pop_from_freelist(wc); |
---|
| 1386 | + e = writecache_pop_from_freelist(wc, (sector_t)-1); |
---|
1261 | 1387 | if (unlikely(!e)) { |
---|
| 1388 | + if (!WC_MODE_PMEM(wc) && !found_entry) { |
---|
| 1389 | +direct_write: |
---|
| 1390 | + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); |
---|
| 1391 | + if (e) { |
---|
| 1392 | + sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector; |
---|
| 1393 | + BUG_ON(!next_boundary); |
---|
| 1394 | + if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { |
---|
| 1395 | + dm_accept_partial_bio(bio, next_boundary); |
---|
| 1396 | + } |
---|
| 1397 | + } |
---|
| 1398 | + goto unlock_remap_origin; |
---|
| 1399 | + } |
---|
1262 | 1400 | writecache_wait_on_freelist(wc); |
---|
1263 | 1401 | continue; |
---|
1264 | 1402 | } |
---|
.. | .. |
---|
1269 | 1407 | if (WC_MODE_PMEM(wc)) { |
---|
1270 | 1408 | bio_copy_block(wc, bio, memory_data(wc, e)); |
---|
1271 | 1409 | } else { |
---|
1272 | | - dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); |
---|
| 1410 | + unsigned bio_size = wc->block_size; |
---|
| 1411 | + sector_t start_cache_sec = cache_sector(wc, e); |
---|
| 1412 | + sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); |
---|
| 1413 | + |
---|
| 1414 | + while (bio_size < bio->bi_iter.bi_size) { |
---|
| 1415 | + if (!search_used) { |
---|
| 1416 | + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); |
---|
| 1417 | + if (!f) |
---|
| 1418 | + break; |
---|
| 1419 | + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + |
---|
| 1420 | + (bio_size >> SECTOR_SHIFT), wc->seq_count); |
---|
| 1421 | + writecache_insert_entry(wc, f); |
---|
| 1422 | + wc->uncommitted_blocks++; |
---|
| 1423 | + } else { |
---|
| 1424 | + struct wc_entry *f; |
---|
| 1425 | + struct rb_node *next = rb_next(&e->rb_node); |
---|
| 1426 | + if (!next) |
---|
| 1427 | + break; |
---|
| 1428 | + f = container_of(next, struct wc_entry, rb_node); |
---|
| 1429 | + if (f != e + 1) |
---|
| 1430 | + break; |
---|
| 1431 | + if (read_original_sector(wc, f) != |
---|
| 1432 | + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) |
---|
| 1433 | + break; |
---|
| 1434 | + if (unlikely(f->write_in_progress)) |
---|
| 1435 | + break; |
---|
| 1436 | + if (writecache_entry_is_committed(wc, f)) |
---|
| 1437 | + wc->overwrote_committed = true; |
---|
| 1438 | + e = f; |
---|
| 1439 | + } |
---|
| 1440 | + bio_size += wc->block_size; |
---|
| 1441 | + current_cache_sec += wc->block_size >> SECTOR_SHIFT; |
---|
| 1442 | + } |
---|
| 1443 | + |
---|
1273 | 1444 | bio_set_dev(bio, wc->ssd_dev->bdev); |
---|
1274 | | - bio->bi_iter.bi_sector = cache_sector(wc, e); |
---|
| 1445 | + bio->bi_iter.bi_sector = start_cache_sec; |
---|
| 1446 | + dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); |
---|
| 1447 | + |
---|
1275 | 1448 | if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { |
---|
1276 | 1449 | wc->uncommitted_blocks = 0; |
---|
1277 | 1450 | queue_work(wc->writeback_wq, &wc->flush_work); |
---|
.. | .. |
---|
1545 | 1718 | bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); |
---|
1546 | 1719 | wb = container_of(bio, struct writeback_struct, bio); |
---|
1547 | 1720 | wb->wc = wc; |
---|
1548 | | - wb->bio.bi_end_io = writecache_writeback_endio; |
---|
1549 | | - bio_set_dev(&wb->bio, wc->dev->bdev); |
---|
1550 | | - wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); |
---|
1551 | | - wb->page_offset = PAGE_SIZE; |
---|
| 1721 | + bio->bi_end_io = writecache_writeback_endio; |
---|
| 1722 | + bio_set_dev(bio, wc->dev->bdev); |
---|
| 1723 | + bio->bi_iter.bi_sector = read_original_sector(wc, e); |
---|
1552 | 1724 | if (max_pages <= WB_LIST_INLINE || |
---|
1553 | 1725 | unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), |
---|
1554 | 1726 | GFP_NOIO | __GFP_NORETRY | |
---|
.. | .. |
---|
1574 | 1746 | wb->wc_list[wb->wc_list_n++] = f; |
---|
1575 | 1747 | e = f; |
---|
1576 | 1748 | } |
---|
1577 | | - bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); |
---|
| 1749 | + bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); |
---|
1578 | 1750 | if (writecache_has_error(wc)) { |
---|
1579 | 1751 | bio->bi_status = BLK_STS_IOERR; |
---|
1580 | | - bio_endio(&wb->bio); |
---|
1581 | | - } else if (unlikely(!bio_sectors(&wb->bio))) { |
---|
| 1752 | + bio_endio(bio); |
---|
| 1753 | + } else if (unlikely(!bio_sectors(bio))) { |
---|
1582 | 1754 | bio->bi_status = BLK_STS_OK; |
---|
1583 | | - bio_endio(&wb->bio); |
---|
| 1755 | + bio_endio(bio); |
---|
1584 | 1756 | } else { |
---|
1585 | | - submit_bio(&wb->bio); |
---|
| 1757 | + submit_bio(bio); |
---|
1586 | 1758 | } |
---|
1587 | 1759 | |
---|
1588 | 1760 | __writeback_throttle(wc, wbl); |
---|
.. | .. |
---|
1642 | 1814 | { |
---|
1643 | 1815 | struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); |
---|
1644 | 1816 | struct blk_plug plug; |
---|
1645 | | - struct wc_entry *e, *f, *g; |
---|
| 1817 | + struct wc_entry *f, *g, *e = NULL; |
---|
1646 | 1818 | struct rb_node *node, *next_node; |
---|
1647 | 1819 | struct list_head skipped; |
---|
1648 | 1820 | struct writeback_list wbl; |
---|
.. | .. |
---|
1670 | 1842 | wbl.size = 0; |
---|
1671 | 1843 | while (!list_empty(&wc->lru) && |
---|
1672 | 1844 | (wc->writeback_all || |
---|
1673 | | - wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) { |
---|
| 1845 | + wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || |
---|
| 1846 | + (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= |
---|
| 1847 | + wc->max_age - wc->max_age / MAX_AGE_DIV))) { |
---|
1674 | 1848 | |
---|
1675 | 1849 | n_walked++; |
---|
1676 | 1850 | if (unlikely(n_walked > WRITEBACK_LATENCY) && |
---|
.. | .. |
---|
1679 | 1853 | break; |
---|
1680 | 1854 | } |
---|
1681 | 1855 | |
---|
1682 | | - e = container_of(wc->lru.prev, struct wc_entry, lru); |
---|
| 1856 | + if (unlikely(wc->writeback_all)) { |
---|
| 1857 | + if (unlikely(!e)) { |
---|
| 1858 | + writecache_flush(wc); |
---|
| 1859 | + e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); |
---|
| 1860 | + } else |
---|
| 1861 | + e = g; |
---|
| 1862 | + } else |
---|
| 1863 | + e = container_of(wc->lru.prev, struct wc_entry, lru); |
---|
1683 | 1864 | BUG_ON(e->write_in_progress); |
---|
1684 | 1865 | if (unlikely(!writecache_entry_is_committed(wc, e))) { |
---|
1685 | 1866 | writecache_flush(wc); |
---|
.. | .. |
---|
1710 | 1891 | if (unlikely(!next_node)) |
---|
1711 | 1892 | break; |
---|
1712 | 1893 | g = container_of(next_node, struct wc_entry, rb_node); |
---|
1713 | | - if (read_original_sector(wc, g) == |
---|
1714 | | - read_original_sector(wc, f)) { |
---|
| 1894 | + if (unlikely(read_original_sector(wc, g) == |
---|
| 1895 | + read_original_sector(wc, f))) { |
---|
1715 | 1896 | f = g; |
---|
1716 | 1897 | continue; |
---|
1717 | 1898 | } |
---|
.. | .. |
---|
1740 | 1921 | g->wc_list_contiguous = BIO_MAX_PAGES; |
---|
1741 | 1922 | f = g; |
---|
1742 | 1923 | e->wc_list_contiguous++; |
---|
1743 | | - if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) |
---|
| 1924 | + if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { |
---|
| 1925 | + if (unlikely(wc->writeback_all)) { |
---|
| 1926 | + next_node = rb_next(&f->rb_node); |
---|
| 1927 | + if (likely(next_node)) |
---|
| 1928 | + g = container_of(next_node, struct wc_entry, rb_node); |
---|
| 1929 | + } |
---|
1744 | 1930 | break; |
---|
| 1931 | + } |
---|
1745 | 1932 | } |
---|
1746 | 1933 | cond_resched(); |
---|
1747 | 1934 | } |
---|
.. | .. |
---|
1922 | 2109 | wc->ti = ti; |
---|
1923 | 2110 | |
---|
1924 | 2111 | mutex_init(&wc->lock); |
---|
| 2112 | + wc->max_age = MAX_AGE_UNSPECIFIED; |
---|
1925 | 2113 | writecache_poison_lists(wc); |
---|
1926 | 2114 | init_waitqueue_head(&wc->freelist_wait); |
---|
1927 | 2115 | timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); |
---|
| 2116 | + timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); |
---|
1928 | 2117 | |
---|
1929 | 2118 | for (i = 0; i < 2; i++) { |
---|
1930 | 2119 | atomic_set(&wc->bio_in_progress[i], 0); |
---|
.. | .. |
---|
1939 | 2128 | goto bad; |
---|
1940 | 2129 | } |
---|
1941 | 2130 | |
---|
1942 | | - wc->writeback_wq = alloc_workqueue("writecache-writeabck", WQ_MEM_RECLAIM, 1); |
---|
| 2131 | + wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); |
---|
1943 | 2132 | if (!wc->writeback_wq) { |
---|
1944 | 2133 | r = -ENOMEM; |
---|
1945 | 2134 | ti->error = "Could not allocate writeback workqueue"; |
---|
.. | .. |
---|
2108 | 2297 | wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); |
---|
2109 | 2298 | wc->autocommit_time_value = autocommit_msecs; |
---|
2110 | 2299 | wc->autocommit_time_set = true; |
---|
| 2300 | + } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { |
---|
| 2301 | + unsigned max_age_msecs; |
---|
| 2302 | + string = dm_shift_arg(&as), opt_params--; |
---|
| 2303 | + if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) |
---|
| 2304 | + goto invalid_optional; |
---|
| 2305 | + if (max_age_msecs > 86400000) |
---|
| 2306 | + goto invalid_optional; |
---|
| 2307 | + wc->max_age = msecs_to_jiffies(max_age_msecs); |
---|
| 2308 | + wc->max_age_set = true; |
---|
| 2309 | + wc->max_age_value = max_age_msecs; |
---|
| 2310 | + } else if (!strcasecmp(string, "cleaner")) { |
---|
| 2311 | + wc->cleaner_set = true; |
---|
| 2312 | + wc->cleaner = true; |
---|
2111 | 2313 | } else if (!strcasecmp(string, "fua")) { |
---|
2112 | 2314 | if (WC_MODE_PMEM(wc)) { |
---|
2113 | 2315 | wc->writeback_fua = true; |
---|
.. | .. |
---|
2133 | 2335 | } |
---|
2134 | 2336 | |
---|
2135 | 2337 | if (WC_MODE_PMEM(wc)) { |
---|
| 2338 | + if (!dax_synchronous(wc->ssd_dev->dax_dev)) { |
---|
| 2339 | + r = -EOPNOTSUPP; |
---|
| 2340 | + ti->error = "Asynchronous persistent memory not supported as pmem cache"; |
---|
| 2341 | + goto bad; |
---|
| 2342 | + } |
---|
| 2343 | + |
---|
2136 | 2344 | r = persistent_memory_claim(wc); |
---|
2137 | 2345 | if (r) { |
---|
2138 | 2346 | ti->error = "Unable to map persistent memory for cache"; |
---|
.. | .. |
---|
2149 | 2357 | if (IS_ERR(wc->flush_thread)) { |
---|
2150 | 2358 | r = PTR_ERR(wc->flush_thread); |
---|
2151 | 2359 | wc->flush_thread = NULL; |
---|
2152 | | - ti->error = "Couldn't spawn endio thread"; |
---|
| 2360 | + ti->error = "Couldn't spawn flush thread"; |
---|
2153 | 2361 | goto bad; |
---|
2154 | 2362 | } |
---|
2155 | 2363 | wake_up_process(wc->flush_thread); |
---|
.. | .. |
---|
2202 | 2410 | } |
---|
2203 | 2411 | } |
---|
2204 | 2412 | |
---|
2205 | | - r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); |
---|
| 2413 | + r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); |
---|
2206 | 2414 | if (r) { |
---|
2207 | 2415 | ti->error = "Hardware memory error when reading superblock"; |
---|
2208 | 2416 | goto bad; |
---|
.. | .. |
---|
2213 | 2421 | ti->error = "Unable to initialize device"; |
---|
2214 | 2422 | goto bad; |
---|
2215 | 2423 | } |
---|
2216 | | - r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); |
---|
| 2424 | + r = copy_mc_to_kernel(&s, sb(wc), |
---|
| 2425 | + sizeof(struct wc_memory_superblock)); |
---|
2217 | 2426 | if (r) { |
---|
2218 | 2427 | ti->error = "Hardware memory error when reading superblock"; |
---|
2219 | 2428 | goto bad; |
---|
.. | .. |
---|
2273 | 2482 | do_div(x, 100); |
---|
2274 | 2483 | wc->freelist_low_watermark = x; |
---|
2275 | 2484 | |
---|
| 2485 | + if (wc->cleaner) |
---|
| 2486 | + activate_cleaner(wc); |
---|
| 2487 | + |
---|
2276 | 2488 | r = writecache_alloc_entries(wc); |
---|
2277 | 2489 | if (r) { |
---|
2278 | 2490 | ti->error = "Cannot allocate memory"; |
---|
2279 | 2491 | goto bad; |
---|
2280 | 2492 | } |
---|
2281 | 2493 | |
---|
2282 | | - ti->num_flush_bios = 1; |
---|
| 2494 | + ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; |
---|
2283 | 2495 | ti->flush_supported = true; |
---|
2284 | 2496 | ti->num_discard_bios = 1; |
---|
2285 | 2497 | |
---|
.. | .. |
---|
2325 | 2537 | extra_args += 2; |
---|
2326 | 2538 | if (wc->autocommit_time_set) |
---|
2327 | 2539 | extra_args += 2; |
---|
| 2540 | + if (wc->max_age_set) |
---|
| 2541 | + extra_args += 2; |
---|
| 2542 | + if (wc->cleaner_set) |
---|
| 2543 | + extra_args++; |
---|
2328 | 2544 | if (wc->writeback_fua_set) |
---|
2329 | 2545 | extra_args++; |
---|
2330 | 2546 | |
---|
.. | .. |
---|
2341 | 2557 | DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); |
---|
2342 | 2558 | if (wc->autocommit_time_set) |
---|
2343 | 2559 | DMEMIT(" autocommit_time %u", wc->autocommit_time_value); |
---|
| 2560 | + if (wc->max_age_set) |
---|
| 2561 | + DMEMIT(" max_age %u", wc->max_age_value); |
---|
| 2562 | + if (wc->cleaner_set) |
---|
| 2563 | + DMEMIT(" cleaner"); |
---|
2344 | 2564 | if (wc->writeback_fua_set) |
---|
2345 | 2565 | DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); |
---|
2346 | 2566 | break; |
---|
.. | .. |
---|
2349 | 2569 | |
---|
2350 | 2570 | static struct target_type writecache_target = { |
---|
2351 | 2571 | .name = "writecache", |
---|
2352 | | - .version = {1, 1, 1}, |
---|
| 2572 | + .version = {1, 4, 0}, |
---|
2353 | 2573 | .module = THIS_MODULE, |
---|
2354 | 2574 | .ctr = writecache_ctr, |
---|
2355 | 2575 | .dtr = writecache_dtr, |
---|