2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/md/dm-writecache.c
@@ -20,12 +20,14 @@
 
 #define HIGH_WATERMARK			50
 #define LOW_WATERMARK			45
-#define MAX_WRITEBACK_JOBS		0
+#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
 #define ENDIO_LATENCY			16
 #define WRITEBACK_LATENCY		64
 #define AUTOCOMMIT_BLOCKS_SSD		65536
 #define AUTOCOMMIT_BLOCKS_PMEM		64
 #define AUTOCOMMIT_MSEC			1000
+#define MAX_AGE_DIV			16
+#define MAX_AGE_UNSPECIFIED		-1UL
 
 #define BITMAP_GRANULARITY	65536
 #if BITMAP_GRANULARITY < PAGE_SIZE
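
The old MAX_WRITEBACK_JOBS of 0 meant "no limit"; since every in-flight writeback job pins pages, the new definition scales the limit with the machine: at most 256 MiB worth of pages (0x10000000 / PAGE_SIZE) and never more than a sixteenth of total RAM. A minimal user-space sketch of the same arithmetic, assuming an illustrative 4 KiB page size and 1 GiB of RAM:

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int main(void)
	{
		unsigned long page_size = 4096;				/* assumed PAGE_SIZE */
		unsigned long totalram_pages = (1UL << 30) / page_size;	/* pages in 1 GiB */

		/* 0x10000000 / 4096 = 65536; 262144 / 16 = 16384, so the RAM cap wins */
		unsigned long max_jobs = MIN(0x10000000UL / page_size, totalram_pages / 16);

		printf("MAX_WRITEBACK_JOBS = %lu\n", max_jobs);		/* prints 16384 */
		return 0;
	}
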
@@ -47,7 +49,7 @@
 #define pmem_assign(dest, src)	((dest) = (src))
 #endif
 
-#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 #endif
 
@@ -88,6 +90,7 @@
 		:47
 #endif
 	;
+	unsigned long age;
 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 	uint64_t original_sector;
 	uint64_t seq_count;
@@ -119,6 +122,7 @@
 	size_t writeback_size;
 	size_t freelist_high_watermark;
 	size_t freelist_low_watermark;
+	unsigned long max_age;
 
 	unsigned uncommitted_blocks;
 	unsigned autocommit_blocks;
@@ -129,6 +133,8 @@
 	unsigned long autocommit_jiffies;
 	struct timer_list autocommit_timer;
 	struct wait_queue_head freelist_wait;
+
+	struct timer_list max_age_timer;
 
 	atomic_t bio_in_progress[2];
 	struct wait_queue_head bio_in_progress_wait[2];
@@ -160,12 +166,16 @@
 	bool max_writeback_jobs_set:1;
 	bool autocommit_blocks_set:1;
 	bool autocommit_time_set:1;
+	bool max_age_set:1;
 	bool writeback_fua_set:1;
 	bool flush_on_suspend:1;
+	bool cleaner:1;
+	bool cleaner_set:1;
 
 	unsigned high_wm_percent_value;
 	unsigned low_wm_percent_value;
 	unsigned autocommit_time_value;
+	unsigned max_age_value;
 
 	unsigned writeback_all;
 	struct workqueue_struct *writeback_wq;
@@ -196,8 +206,6 @@
 	struct dm_writecache *wc;
 	struct wc_entry **wc_list;
 	unsigned wc_list_n;
-	unsigned page_offset;
-	struct page *page;
 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 	struct bio bio;
 };
@@ -236,10 +244,6 @@
 
 	wc->memory_vmapped = false;
 
-	if (!wc->ssd_dev->dax_dev) {
-		r = -EOPNOTSUPP;
-		goto err1;
-	}
 	s = wc->memory_map_size;
 	p = s >> PAGE_SHIFT;
 	if (!p) {
@@ -366,10 +370,7 @@
 
 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
 {
-	if (is_power_of_2(sizeof(struct wc_entry)) && 0)
-		return &sb(wc)->entries[e - wc->entries];
-	else
-		return &sb(wc)->entries[e->index];
+	return &sb(wc)->entries[e->index];
 }
 
 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
@@ -523,10 +524,38 @@
 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 }
 
+static void ssd_commit_superblock(struct dm_writecache *wc)
+{
+	int r;
+	struct dm_io_region region;
+	struct dm_io_request req;
+
+	region.bdev = wc->ssd_dev->bdev;
+	region.sector = 0;
+	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
+
+	if (unlikely(region.sector + region.count > wc->metadata_sectors))
+		region.count = wc->metadata_sectors - region.sector;
+
+	region.sector += wc->start_sector;
+
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = REQ_SYNC | REQ_FUA;
+	req.mem.type = DM_IO_VMA;
+	req.mem.ptr.vma = (char *)wc->memory_map;
+	req.client = wc->dm_io;
+	req.notify.fn = NULL;
+	req.notify.context = NULL;
+
+	r = dm_io(&req, 1, &region, NULL);
+	if (unlikely(r))
+		writecache_error(wc, r, "error writing superblock");
+}
+
 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 {
 	if (WC_MODE_PMEM(wc))
-		wmb();
+		pmem_wmb();
 	else
 		ssd_commit_flushed(wc, wait_for_ios);
 }
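
In SSD mode the superblock (which carries seq_count) now gets a dedicated REQ_SYNC | REQ_FUA write through dm_io() rather than going through the ssd_commit_flushed() path; the written region is the first max(4096, block_size) bytes of the metadata area, clamped to metadata_sectors. A worked example of the sizing, assuming 512-byte sectors and an illustrative 4 KiB cache block size:

	#include <stdio.h>

	int main(void)
	{
		unsigned block_size = 4096;	/* illustrative cache block size */
		unsigned sector_shift = 9;	/* 512-byte sectors */
		unsigned bytes = block_size > 4096U ? block_size : 4096U;

		/* max(4096, 4096) >> 9 = 8 sectors per superblock commit */
		printf("superblock write: %u sectors\n", bytes >> sector_shift);
		return 0;
	}

In pmem mode the same commit is just a persistence barrier, now pmem_wmb() instead of the plain wmb().
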
@@ -568,21 +597,20 @@
 		e = container_of(node, struct wc_entry, rb_node);
 		if (read_original_sector(wc, e) == block)
 			break;
+
 		node = (read_original_sector(wc, e) >= block ?
 			e->rb_node.rb_left : e->rb_node.rb_right);
 		if (unlikely(!node)) {
-			if (!(flags & WFE_RETURN_FOLLOWING)) {
+			if (!(flags & WFE_RETURN_FOLLOWING))
 				return NULL;
-			}
 			if (read_original_sector(wc, e) >= block) {
-				break;
+				return e;
 			} else {
 				node = rb_next(&e->rb_node);
-				if (unlikely(!node)) {
+				if (unlikely(!node))
 					return NULL;
-				}
 				e = container_of(node, struct wc_entry, rb_node);
-				break;
+				return e;
 			}
 		}
 	}
@@ -593,7 +621,7 @@
 			node = rb_prev(&e->rb_node);
 		else
 			node = rb_next(&e->rb_node);
-		if (!node)
+		if (unlikely(!node))
 			return e;
 		e2 = container_of(node, struct wc_entry, rb_node);
 		if (read_original_sector(wc, e2) != block)
@@ -618,6 +646,7 @@
 	rb_link_node(&ins->rb_node, parent, node);
 	rb_insert_color(&ins->rb_node, &wc->tree);
 	list_add(&ins->lru, &wc->lru);
+	ins->age = jiffies;
 }
 
 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
@@ -653,7 +682,17 @@
 		queue_work(wc->writeback_wq, &wc->writeback_work);
 }
 
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static void writecache_max_age_timer(struct timer_list *t)
+{
+	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+
+	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+	}
+}
+
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
 	struct wc_entry *e;
 
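
writecache_max_age_timer() re-arms itself every max_age / MAX_AGE_DIV jiffies, so the writeback worker is woken sixteen times per max_age window and no entry has to wait a whole extra window after crossing the age threshold. A minimal sketch of the same self-re-arming pattern, assuming a hypothetical struct foo (timer_setup(), from_timer() and mod_timer() are the regular kernel timer API):

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	struct foo {				/* hypothetical container */
		struct timer_list tick;
		unsigned long period;		/* re-arm interval, in jiffies */
	};

	static void foo_tick(struct timer_list *t)
	{
		/* from_timer() is a container_of() wrapper keyed on the member name */
		struct foo *f = from_timer(f, t, tick);

		/* ...periodic work goes here... */

		mod_timer(&f->tick, jiffies + f->period);	/* re-arm */
	}

	/* one-time setup: timer_setup(&f->tick, foo_tick, 0);
	 * first arming:  mod_timer(&f->tick, jiffies + f->period); */
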
@@ -662,6 +701,8 @@
 		if (unlikely(!wc->current_free))
 			return NULL;
 		e = wc->current_free;
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		next = rb_next(&e->rb_node);
 		rb_erase(&e->rb_node, &wc->freetree);
 		if (unlikely(!next))
@@ -671,6 +712,8 @@
 		if (unlikely(list_empty(&wc->freelist)))
 			return NULL;
 		e = container_of(wc->freelist.next, struct wc_entry, lru);
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		list_del(&e->lru);
 	}
 	wc->freelist_size--;
@@ -759,8 +802,10 @@
 
 	wc->seq_count++;
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
-	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
-	writecache_commit_flushed(wc, false);
+	if (WC_MODE_PMEM(wc))
+		writecache_commit_flushed(wc, false);
+	else
+		ssd_commit_superblock(wc);
 
 	wc->overwrote_committed = false;
 
@@ -823,8 +868,10 @@
 
 		if (likely(!e->write_in_progress)) {
 			if (!discarded_something) {
-				writecache_wait_for_ios(wc, READ);
-				writecache_wait_for_ios(wc, WRITE);
+				if (!WC_MODE_PMEM(wc)) {
+					writecache_wait_for_ios(wc, READ);
+					writecache_wait_for_ios(wc, WRITE);
+				}
 				discarded_something = true;
 			}
 			if (!writecache_entry_is_committed(wc, e))
@@ -832,7 +879,7 @@
 			writecache_free_entry(wc, e);
 		}
 
-		if (!node)
+		if (unlikely(!node))
 			break;
 
 		e = container_of(node, struct wc_entry, rb_node);
@@ -857,6 +904,7 @@
 	bool flush_on_suspend;
 
 	del_timer_sync(&wc->autocommit_timer);
+	del_timer_sync(&wc->max_age_timer);
 
 	wc_lock(wc);
 	writecache_flush(wc);
@@ -955,7 +1003,8 @@
 	}
 	wc->freelist_size = 0;
 
-	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
+			      sizeof(uint64_t));
 	if (r) {
 		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
 		sb_seq_count = cpu_to_le64(0);
@@ -971,7 +1020,8 @@
 			e->seq_count = -1;
 			continue;
 		}
-		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
+				      sizeof(struct wc_memory_entry));
 		if (r) {
 			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
 					 (unsigned long)b, r);
@@ -1025,6 +1075,9 @@
 
 	writecache_verify_watermark(wc);
 
+	if (wc->max_age != MAX_AGE_UNSPECIFIED)
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+
 	wc_unlock(wc);
 }
 
@@ -1073,6 +1126,28 @@
 	return 0;
 }
 
+static void activate_cleaner(struct dm_writecache *wc)
+{
+	wc->flush_on_suspend = true;
+	wc->cleaner = true;
+	wc->freelist_high_watermark = wc->n_blocks;
+	wc->freelist_low_watermark = wc->n_blocks;
+}
+
+static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	activate_cleaner(wc);
+	if (!dm_suspended(wc->ti))
+		writecache_verify_watermark(wc);
+	wc_unlock(wc);
+
+	return 0;
+}
+
 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 			      char *result, unsigned maxlen)
 {
@@ -1083,10 +1158,48 @@
 		r = process_flush_mesg(argc, argv, wc);
 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
 		r = process_flush_on_suspend_mesg(argc, argv, wc);
+	else if (!strcasecmp(argv[0], "cleaner"))
+		r = process_cleaner_mesg(argc, argv, wc);
 	else
 		DMERR("unrecognised message received: %s", argv[0]);
 
 	return r;
+}
+
+static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
+{
+	/*
+	 * clflushopt performs better with block size 1024, 2048, 4096
+	 * non-temporal stores perform better with block size 512
+	 *
+	 * block size	512		1024		2048		4096
+	 * movnti	496 MB/s	642 MB/s	725 MB/s	744 MB/s
+	 * clflushopt	373 MB/s	688 MB/s	1.1 GB/s	1.2 GB/s
+	 *
+	 * We see that movnti performs better for 512-byte blocks, and
+	 * clflushopt performs better for 1024-byte and larger blocks. So, we
+	 * prefer clflushopt for sizes >= 768.
+	 *
+	 * NOTE: this happens to be the case now (with dm-writecache's single
+	 * threaded model) but re-evaluate this once memcpy_flushcache() is
+	 * enabled to use movdir64b which might invalidate this performance
+	 * advantage seen with cache-allocating-writes plus flushing.
+	 */
+#ifdef CONFIG_X86
+	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+	    likely(boot_cpu_data.x86_clflush_size == 64) &&
+	    likely(size >= 768)) {
+		do {
+			memcpy((void *)dest, (void *)source, 64);
+			clflushopt((void *)dest);
+			dest += 64;
+			source += 64;
+			size -= 64;
+		} while (size >= 64);
+		return;
+	}
+#endif
+	memcpy_flushcache(dest, source, size);
 }
 
 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
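
Reading the measurements in the comment: non-temporal stores win only at 512-byte blocks (496 vs 373 MB/s), while clflushopt wins from 1024 bytes up (688 vs 642 MB/s and better), so 768 is chosen as the crossover. Because dm-writecache block sizes are powers of two of at least 512 bytes, any size taking the clflushopt path is a whole number of 64-byte cache lines, which is why the do/while loop can consume the buffer exactly and return without falling through to memcpy_flushcache(). A user-space analog of the per-line loop, under the same multiple-of-64 assumption:

	#include <string.h>

	#define CACHE_LINE 64

	/* Copy one cache line at a time, flushing each line after writing it;
	 * the kernel version issues clflushopt(dest) at the marked point.
	 * Assumes size is a non-zero multiple of CACHE_LINE, as at the
	 * dm-writecache call sites. */
	static void copy_and_flush_by_line(char *dest, const char *source, size_t size)
	{
		do {
			memcpy(dest, source, CACHE_LINE);
			/* clflushopt(dest) here in the kernel version */
			dest += CACHE_LINE;
			source += CACHE_LINE;
			size -= CACHE_LINE;
		} while (size >= CACHE_LINE);
	}
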
@@ -1106,7 +1219,7 @@
 
 	if (rw == READ) {
 		int r;
-		r = memcpy_mcsafe(buf, data, size);
+		r = copy_mc_to_kernel(buf, data, size);
 		flush_dcache_page(bio_page(bio));
 		if (unlikely(r)) {
 			writecache_error(wc, r, "hardware memory error when reading data: %d", r);
@@ -1114,7 +1227,7 @@
 		}
 	} else {
 		flush_dcache_page(bio_page(bio));
-		memcpy_flushcache(data, buf, size);
+		memcpy_flushcache_optimized(data, buf, size);
 	}
 
 	bvec_kunmap_irq(buf, &flags);
@@ -1152,7 +1265,7 @@
 				     bio_end_sector(bio));
 		wc_unlock(wc);
 		bio_set_dev(bio, wc->dev->bdev);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 	} else {
 		writecache_flush(wc);
 		wc_unlock(wc);
@@ -1188,8 +1301,12 @@
 			writecache_flush(wc);
 			if (writecache_has_error(wc))
 				goto unlock_error;
+			if (unlikely(wc->cleaner))
+				goto unlock_remap_origin;
 			goto unlock_submit;
 		} else {
+			if (dm_bio_get_target_bio_nr(bio))
+				goto unlock_remap_origin;
 			writecache_offload_bio(wc, bio);
 			goto unlock_return;
 		}
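
The dm_bio_get_target_bio_nr() test pairs with the change near the end of this patch that sets ti->num_flush_bios to 2 in SSD mode: device-mapper then submits each flush twice with distinct target bio numbers, so bio 0 commits the cache while bio 1 takes the unlock_remap_origin path and flushes the origin device as well.
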
@@ -1246,19 +1363,40 @@
 		}
 	} else {
 		do {
+			bool found_entry = false;
+			bool search_used = false;
 			if (writecache_has_error(wc))
 				goto unlock_error;
 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
 			if (e) {
-				if (!writecache_entry_is_committed(wc, e))
-					goto bio_copy;
-				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
-					wc->overwrote_committed = true;
+				if (!writecache_entry_is_committed(wc, e)) {
+					search_used = true;
 					goto bio_copy;
 				}
+				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+					wc->overwrote_committed = true;
+					search_used = true;
+					goto bio_copy;
+				}
+				found_entry = true;
+			} else {
+				if (unlikely(wc->cleaner))
+					goto direct_write;
 			}
-			e = writecache_pop_from_freelist(wc);
+			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
+				if (!WC_MODE_PMEM(wc) && !found_entry) {
+direct_write:
+					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+					if (e) {
+						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+						BUG_ON(!next_boundary);
+						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+							dm_accept_partial_bio(bio, next_boundary);
+						}
+					}
+					goto unlock_remap_origin;
+				}
 				writecache_wait_on_freelist(wc);
 				continue;
 			}
@@ -1269,9 +1407,44 @@
 			if (WC_MODE_PMEM(wc)) {
 				bio_copy_block(wc, bio, memory_data(wc, e));
 			} else {
-				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				unsigned bio_size = wc->block_size;
+				sector_t start_cache_sec = cache_sector(wc, e);
+				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+				while (bio_size < bio->bi_iter.bi_size) {
+					if (!search_used) {
+						struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+						if (!f)
+							break;
+						write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+										(bio_size >> SECTOR_SHIFT), wc->seq_count);
+						writecache_insert_entry(wc, f);
+						wc->uncommitted_blocks++;
+					} else {
+						struct wc_entry *f;
+						struct rb_node *next = rb_next(&e->rb_node);
+						if (!next)
+							break;
+						f = container_of(next, struct wc_entry, rb_node);
+						if (f != e + 1)
+							break;
+						if (read_original_sector(wc, f) !=
+						    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+							break;
+						if (unlikely(f->write_in_progress))
+							break;
+						if (writecache_entry_is_committed(wc, f))
+							wc->overwrote_committed = true;
+						e = f;
+					}
+					bio_size += wc->block_size;
+					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+				}
+
 				bio_set_dev(bio, wc->ssd_dev->bdev);
-				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				bio->bi_iter.bi_sector = start_cache_sec;
+				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
 					wc->uncommitted_blocks = 0;
 					queue_work(wc->writeback_wq, &wc->flush_work);
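
This is the SSD-mode large-write optimization: after claiming the first block, the loop keeps extending the mapping one cache block at a time, either popping a free entry whose cache sector equals current_cache_sec (the new expected_sector argument) or, when overwriting uncommitted data (search_used), following rb_next() while the entries remain adjacent both physically and in origin-sector order. A 64 KiB write with 4 KiB blocks can then leave as a single 64 KiB bio to the cache device instead of sixteen block-sized ones. A condensed restatement of the adjacency test from the search_used branch:

	/* Can entry f extend entry e in one bio? (restating the checks above) */
	static bool wc_entry_extends(struct dm_writecache *wc,
				     struct wc_entry *e, struct wc_entry *f)
	{
		return f == e + 1 &&		/* next metadata slot, hence next cache sector */
		       read_original_sector(wc, f) ==
		       read_original_sector(wc, e) +
				(wc->block_size >> SECTOR_SHIFT) &&	/* next origin block */
		       !f->write_in_progress;	/* not currently being written back */
	}
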
@@ -1545,10 +1718,9 @@
 	bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
 	wb = container_of(bio, struct writeback_struct, bio);
 	wb->wc = wc;
-	wb->bio.bi_end_io = writecache_writeback_endio;
-	bio_set_dev(&wb->bio, wc->dev->bdev);
-	wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
-	wb->page_offset = PAGE_SIZE;
+	bio->bi_end_io = writecache_writeback_endio;
+	bio_set_dev(bio, wc->dev->bdev);
+	bio->bi_iter.bi_sector = read_original_sector(wc, e);
 	if (max_pages <= WB_LIST_INLINE ||
 	    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
 						   GFP_NOIO | __GFP_NORETRY |
@@ -1574,15 +1746,15 @@
 		wb->wc_list[wb->wc_list_n++] = f;
 		e = f;
 	}
-	bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
 	if (writecache_has_error(wc)) {
 		bio->bi_status = BLK_STS_IOERR;
-		bio_endio(&wb->bio);
-	} else if (unlikely(!bio_sectors(&wb->bio))) {
+		bio_endio(bio);
+	} else if (unlikely(!bio_sectors(bio))) {
 		bio->bi_status = BLK_STS_OK;
-		bio_endio(&wb->bio);
+		bio_endio(bio);
 	} else {
-		submit_bio(&wb->bio);
+		submit_bio(bio);
 	}
 
 	__writeback_throttle(wc, wbl);
@@ -1642,7 +1814,7 @@
 {
 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
 	struct blk_plug plug;
-	struct wc_entry *e, *f, *g;
+	struct wc_entry *f, *g, *e = NULL;
 	struct rb_node *node, *next_node;
 	struct list_head skipped;
 	struct writeback_list wbl;
@@ -1670,7 +1842,9 @@
 	wbl.size = 0;
 	while (!list_empty(&wc->lru) &&
 	       (wc->writeback_all ||
-		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
+		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
+		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
+		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
 
 		n_walked++;
 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
@@ -1679,7 +1853,14 @@
 			break;
 		}
 
-		e = container_of(wc->lru.prev, struct wc_entry, lru);
+		if (unlikely(wc->writeback_all)) {
+			if (unlikely(!e)) {
+				writecache_flush(wc);
+				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
+			} else
+				e = g;
+		} else
+			e = container_of(wc->lru.prev, struct wc_entry, lru);
 		BUG_ON(e->write_in_progress);
 		if (unlikely(!writecache_entry_is_committed(wc, e))) {
 			writecache_flush(wc);
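
The writeback loop now triggers for three reasons: an explicit flush (writeback_all, which also switches from LRU order to sector order by walking the rb-tree from rb_first(), so a full flush issues sequential I/O), freelist pressure against the low watermark, and age, where the oldest LRU entry is compared against max_age minus one timer period. Worked numbers, assuming an illustrative max_age of 10 seconds:

	#include <stdio.h>

	int main(void)
	{
		unsigned long max_age = 10000;			/* ms, illustrative */
		unsigned long period = max_age / 16;		/* MAX_AGE_DIV == 16 -> 625 ms */
		unsigned long threshold = max_age - period;	/* 9375 ms */

		/* the timer scans every 625 ms and anything dirty for more than
		 * 9375 ms is written back, bounding dirty age at about max_age */
		printf("scan every %lu ms, write back entries older than %lu ms\n",
		       period, threshold);
		return 0;
	}
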
@@ -1710,8 +1891,8 @@
 			if (unlikely(!next_node))
 				break;
 			g = container_of(next_node, struct wc_entry, rb_node);
-			if (read_original_sector(wc, g) ==
-			    read_original_sector(wc, f)) {
+			if (unlikely(read_original_sector(wc, g) ==
+				     read_original_sector(wc, f))) {
 				f = g;
 				continue;
 			}
@@ -1740,8 +1921,14 @@
 			g->wc_list_contiguous = BIO_MAX_PAGES;
 			f = g;
 			e->wc_list_contiguous++;
-			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
+			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+				if (unlikely(wc->writeback_all)) {
+					next_node = rb_next(&f->rb_node);
+					if (likely(next_node))
+						g = container_of(next_node, struct wc_entry, rb_node);
+				}
 				break;
+			}
 		}
 		cond_resched();
 	}
@@ -1922,9 +2109,11 @@
 	wc->ti = ti;
 
 	mutex_init(&wc->lock);
+	wc->max_age = MAX_AGE_UNSPECIFIED;
 	writecache_poison_lists(wc);
 	init_waitqueue_head(&wc->freelist_wait);
 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
+	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
 
 	for (i = 0; i < 2; i++) {
 		atomic_set(&wc->bio_in_progress[i], 0);
@@ -1939,7 +2128,7 @@
 		goto bad;
 	}
 
-	wc->writeback_wq = alloc_workqueue("writecache-writeabck", WQ_MEM_RECLAIM, 1);
+	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
 	if (!wc->writeback_wq) {
 		r = -ENOMEM;
 		ti->error = "Could not allocate writeback workqueue";
@@ -2108,6 +2297,19 @@
 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
 			wc->autocommit_time_value = autocommit_msecs;
 			wc->autocommit_time_set = true;
+		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
+			unsigned max_age_msecs;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
+				goto invalid_optional;
+			if (max_age_msecs > 86400000)
+				goto invalid_optional;
+			wc->max_age = msecs_to_jiffies(max_age_msecs);
+			wc->max_age_set = true;
+			wc->max_age_value = max_age_msecs;
+		} else if (!strcasecmp(string, "cleaner")) {
+			wc->cleaner_set = true;
+			wc->cleaner = true;
 		} else if (!strcasecmp(string, "fua")) {
 			if (WC_MODE_PMEM(wc)) {
 				wc->writeback_fua = true;
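
With these options parsed, a max_age cap can be set in the table line and an existing device can be flipped into cleaner mode at runtime; the device names and sizes below are illustrative:

	dmsetup create wc --table "0 409600 writecache s /dev/vg/origin /dev/vg/cache 4096 2 max_age 10000"
	dmsetup message wc 0 cleaner

max_age is in milliseconds and rejected above 86400000 (one day); the cleaner message takes effect without a table reload.
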
@@ -2133,6 +2335,12 @@
 	}
 
 	if (WC_MODE_PMEM(wc)) {
+		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
+			r = -EOPNOTSUPP;
+			ti->error = "Asynchronous persistent memory not supported as pmem cache";
+			goto bad;
+		}
+
 		r = persistent_memory_claim(wc);
 		if (r) {
 			ti->error = "Unable to map persistent memory for cache";
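
The new check rejects "asynchronous" DAX devices for pmem mode: this driver's pmem path makes data durable purely with CPU cache-flush instructions (memcpy_flushcache() and the pmem_wmb() added earlier), which is only a persistence guarantee on synchronous DAX devices; anything else has to be used in SSD mode, where commits are explicit block-layer writes.
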
@@ -2149,7 +2357,7 @@
 	if (IS_ERR(wc->flush_thread)) {
 		r = PTR_ERR(wc->flush_thread);
 		wc->flush_thread = NULL;
-		ti->error = "Couldn't spawn endio thread";
+		ti->error = "Couldn't spawn flush thread";
 		goto bad;
 	}
 	wake_up_process(wc->flush_thread);
@@ -2202,7 +2410,7 @@
 		}
 	}
 
-	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
 	if (r) {
 		ti->error = "Hardware memory error when reading superblock";
 		goto bad;
@@ -2213,7 +2421,8 @@
 		ti->error = "Unable to initialize device";
 		goto bad;
 	}
-	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+	r = copy_mc_to_kernel(&s, sb(wc),
+			      sizeof(struct wc_memory_superblock));
 	if (r) {
 		ti->error = "Hardware memory error when reading superblock";
 		goto bad;
@@ -2273,13 +2482,16 @@
 	do_div(x, 100);
 	wc->freelist_low_watermark = x;
 
+	if (wc->cleaner)
+		activate_cleaner(wc);
+
 	r = writecache_alloc_entries(wc);
 	if (r) {
 		ti->error = "Cannot allocate memory";
 		goto bad;
 	}
 
-	ti->num_flush_bios = 1;
+	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
 	ti->flush_supported = true;
 	ti->num_discard_bios = 1;
 
@@ -2325,6 +2537,10 @@
 		extra_args += 2;
 	if (wc->autocommit_time_set)
 		extra_args += 2;
+	if (wc->max_age_set)
+		extra_args += 2;
+	if (wc->cleaner_set)
+		extra_args++;
 	if (wc->writeback_fua_set)
 		extra_args++;
 
@@ -2341,6 +2557,10 @@
 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
 		if (wc->autocommit_time_set)
 			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+		if (wc->max_age_set)
+			DMEMIT(" max_age %u", wc->max_age_value);
+		if (wc->cleaner_set)
+			DMEMIT(" cleaner");
 		if (wc->writeback_fua_set)
 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
 		break;
@@ -2349,7 +2569,7 @@
 
 static struct target_type writecache_target = {
 	.name			= "writecache",
-	.version		= {1, 1, 1},
+	.version		= {1, 4, 0},
 	.module			= THIS_MODULE,
 	.ctr			= writecache_ctr,
 	.dtr			= writecache_dtr,