2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/nvdimm/pmem.c
@@ -1,21 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Persistent Memory Driver
  *
  * Copyright (c) 2014-2015, Intel Corporation.
  * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
  * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 
-#include <asm/cacheflush.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
@@ -33,10 +24,11 @@
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
 #include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
-#include "nd-core.h"
 
 static struct device *to_dev(struct pmem_device *pmem)
 {
@@ -133,7 +125,7 @@
         while (len) {
                 mem = kmap_atomic(page);
                 chunk = min_t(unsigned int, len, PAGE_SIZE - off);
-                rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+                rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk);
                 kunmap_atomic(mem);
                 if (rem)
                         return BLK_STS_IOERR;
@@ -145,9 +137,25 @@
         return BLK_STS_OK;
 }
 
-static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-                unsigned int len, unsigned int off, unsigned int op,
-                sector_t sector)
+static blk_status_t pmem_do_read(struct pmem_device *pmem,
+                struct page *page, unsigned int page_off,
+                sector_t sector, unsigned int len)
+{
+        blk_status_t rc;
+        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+        void *pmem_addr = pmem->virt_addr + pmem_off;
+
+        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+                return BLK_STS_IOERR;
+
+        rc = read_pmem(page, page_off, pmem_addr, len);
+        flush_dcache_page(page);
+        return rc;
+}
+
+static blk_status_t pmem_do_write(struct pmem_device *pmem,
+                struct page *page, unsigned int page_off,
+                sector_t sector, unsigned int len)
 {
         blk_status_t rc = BLK_STS_OK;
         bool bad_pmem = false;
@@ -157,40 +165,31 @@
         if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                 bad_pmem = true;
 
-        if (!op_is_write(op)) {
-                if (unlikely(bad_pmem))
-                        rc = BLK_STS_IOERR;
-                else {
-                        rc = read_pmem(page, off, pmem_addr, len);
-                        flush_dcache_page(page);
-                }
-        } else {
-                /*
-                 * Note that we write the data both before and after
-                 * clearing poison. The write before clear poison
-                 * handles situations where the latest written data is
-                 * preserved and the clear poison operation simply marks
-                 * the address range as valid without changing the data.
-                 * In this case application software can assume that an
-                 * interrupted write will either return the new good
-                 * data or an error.
-                 *
-                 * However, if pmem_clear_poison() leaves the data in an
-                 * indeterminate state we need to perform the write
-                 * after clear poison.
-                 */
-                flush_dcache_page(page);
-                write_pmem(pmem_addr, page, off, len);
-                if (unlikely(bad_pmem)) {
-                        rc = pmem_clear_poison(pmem, pmem_off, len);
-                        write_pmem(pmem_addr, page, off, len);
-                }
+        /*
+         * Note that we write the data both before and after
+         * clearing poison. The write before clear poison
+         * handles situations where the latest written data is
+         * preserved and the clear poison operation simply marks
+         * the address range as valid without changing the data.
+         * In this case application software can assume that an
+         * interrupted write will either return the new good
+         * data or an error.
+         *
+         * However, if pmem_clear_poison() leaves the data in an
+         * indeterminate state we need to perform the write
+         * after clear poison.
+         */
+        flush_dcache_page(page);
+        write_pmem(pmem_addr, page, page_off, len);
+        if (unlikely(bad_pmem)) {
+                rc = pmem_clear_poison(pmem, pmem_off, len);
+                write_pmem(pmem_addr, page, page_off, len);
         }
 
         return rc;
 }
 
-static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t pmem_submit_bio(struct bio *bio)
 {
         int ret = 0;
         blk_status_t rc = 0;
@@ -198,23 +197,29 @@
         unsigned long start;
         struct bio_vec bvec;
         struct bvec_iter iter;
-        struct pmem_device *pmem = q->queuedata;
+        struct pmem_device *pmem = bio->bi_disk->private_data;
         struct nd_region *nd_region = to_region(pmem);
 
         if (bio->bi_opf & REQ_PREFLUSH)
                 ret = nvdimm_flush(nd_region, bio);
 
-        do_acct = nd_iostat_start(bio, &start);
+        do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+        if (do_acct)
+                start = bio_start_io_acct(bio);
         bio_for_each_segment(bvec, bio, iter) {
-                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-                                bvec.bv_offset, bio_op(bio), iter.bi_sector);
+                if (op_is_write(bio_op(bio)))
+                        rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
+                                        iter.bi_sector, bvec.bv_len);
+                else
+                        rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
+                                        iter.bi_sector, bvec.bv_len);
                 if (rc) {
                         bio->bi_status = rc;
                         break;
                 }
         }
         if (do_acct)
-                nd_iostat_end(bio, start);
+                bio_end_io_acct(bio, start);
 
         if (bio->bi_opf & REQ_FUA)
                 ret = nvdimm_flush(nd_region, bio);
@@ -229,12 +234,13 @@
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
                 struct page *page, unsigned int op)
 {
-        struct pmem_device *pmem = bdev->bd_queue->queuedata;
+        struct pmem_device *pmem = bdev->bd_disk->private_data;
         blk_status_t rc;
 
-        rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-                        0, op, sector);
-
+        if (op_is_write(op))
+                rc = pmem_do_write(pmem, page, 0, sector, thp_size(page));
+        else
+                rc = pmem_do_read(pmem, page, 0, sector, thp_size(page));
         /*
          * The ->rw_page interface is subtle and tricky. The core
          * retries on any error, so we can only invoke page_endio() in
@@ -273,9 +279,19 @@
 
 static const struct block_device_operations pmem_fops = {
         .owner = THIS_MODULE,
+        .submit_bio = pmem_submit_bio,
         .rw_page = pmem_rw_page,
-        .revalidate_disk = nvdimm_revalidate_disk,
 };
+
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                size_t nr_pages)
+{
+        struct pmem_device *pmem = dax_get_private(dax_dev);
+
+        return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
+                                PFN_PHYS(pgoff) >> SECTOR_SHIFT,
+                                PAGE_SIZE));
+}
 
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
                 pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
@@ -287,7 +303,7 @@
 
         /*
          * Use the 'no check' versions of copy_from_iter_flushcache() and
-         * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
+         * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
          * checking, both file offset and device offset, is handled by
          * dax_iomap_actor()
          */
@@ -300,13 +316,15 @@
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
                 void *addr, size_t bytes, struct iov_iter *i)
 {
-        return _copy_to_iter_mcsafe(addr, bytes, i);
+        return _copy_mc_to_iter(addr, bytes, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {
         .direct_access = pmem_dax_direct_access,
+        .dax_supported = generic_fsdax_supported,
         .copy_from_iter = pmem_copy_from_iter,
         .copy_to_iter = pmem_copy_to_iter,
+        .zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
@@ -314,16 +332,24 @@
         NULL,
 };
 
-static void pmem_release_queue(void *q)
+static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
+        struct request_queue *q =
+                container_of(pgmap->ref, struct request_queue, q_usage_counter);
+
         blk_cleanup_queue(q);
 }
 
-static void pmem_freeze_queue(struct percpu_ref *ref)
+static void pmem_release_queue(void *pgmap)
 {
-        struct request_queue *q;
+        pmem_pagemap_cleanup(pgmap);
+}
 
-        q = container_of(ref, typeof(*q), q_usage_counter);
+static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
+{
+        struct request_queue *q =
+                container_of(pgmap->ref, struct request_queue, q_usage_counter);
+
         blk_freeze_queue_start(q);
 }
 
@@ -337,26 +363,10 @@
         put_disk(pmem->disk);
 }
 
-static void pmem_release_pgmap_ops(void *__pgmap)
-{
-        dev_pagemap_put_ops();
-}
-
-static void fsdax_pagefree(struct page *page, void *data)
-{
-        wake_up_var(&page->_refcount);
-}
-
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
-{
-        dev_pagemap_get_ops();
-        if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
-                return -ENOMEM;
-        pgmap->type = MEMORY_DEVICE_FS_DAX;
-        pgmap->page_free = fsdax_pagefree;
-
-        return 0;
-}
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+        .kill = pmem_pagemap_kill,
+        .cleanup = pmem_pagemap_cleanup,
+};
 
 static int pmem_attach_disk(struct device *dev,
                 struct nd_namespace_common *ndns)
@@ -365,7 +375,7 @@
         struct nd_region *nd_region = to_nd_region(dev->parent);
         int nid = dev_to_node(dev), fua;
         struct resource *res = &nsio->res;
-        struct resource bb_res;
+        struct range bb_range;
         struct nd_pfn *nd_pfn = NULL;
         struct dax_device *dax_dev;
         struct nd_pfn_sb *pfn_sb;
@@ -375,10 +385,15 @@
         struct gendisk *disk;
         void *addr;
         int rc;
+        unsigned long flags = 0UL;
 
         pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
         if (!pmem)
                 return -ENOMEM;
+
+        rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
+        if (rc)
+                return rc;
 
         /* while nsio_rw_bytes is active, parse a pfn info block if present */
         if (is_nd_pfn(dev)) {
@@ -389,7 +404,7 @@
         }
 
         /* we're attaching a block device, disable raw namespace access */
-        devm_nsio_disable(dev, nsio);
+        devm_namespace_disable(dev, ndns);
 
         dev_set_drvdata(dev, pmem);
         pmem->phys_addr = res->start;
@@ -406,39 +421,40 @@
                 return -EBUSY;
         }
 
-        q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
+        q = blk_alloc_queue(dev_to_node(dev));
         if (!q)
-                return -ENOMEM;
-
-        if (devm_add_action_or_reset(dev, pmem_release_queue, q))
                 return -ENOMEM;
 
         pmem->pfn_flags = PFN_DEV;
         pmem->pgmap.ref = &q->q_usage_counter;
-        pmem->pgmap.kill = pmem_freeze_queue;
         if (is_nd_pfn(dev)) {
-                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-                        return -ENOMEM;
+                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+                pmem->pgmap.ops = &fsdax_pagemap_ops;
                 addr = devm_memremap_pages(dev, &pmem->pgmap);
                 pfn_sb = nd_pfn->pfn_sb;
                 pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
                 pmem->pfn_pad = resource_size(res) -
-                        resource_size(&pmem->pgmap.res);
+                        range_len(&pmem->pgmap.range);
                 pmem->pfn_flags |= PFN_MAP;
-                memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-                bb_res.start += pmem->data_offset;
+                bb_range = pmem->pgmap.range;
+                bb_range.start += pmem->data_offset;
         } else if (pmem_should_map_pages(dev)) {
-                memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
-                pmem->pgmap.altmap_valid = false;
-                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-                        return -ENOMEM;
+                pmem->pgmap.range.start = res->start;
+                pmem->pgmap.range.end = res->end;
+                pmem->pgmap.nr_range = 1;
+                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+                pmem->pgmap.ops = &fsdax_pagemap_ops;
                 addr = devm_memremap_pages(dev, &pmem->pgmap);
                 pmem->pfn_flags |= PFN_MAP;
-                memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
+                bb_range = pmem->pgmap.range;
         } else {
                 addr = devm_memremap(dev, pmem->phys_addr,
                                 pmem->size, ARCH_MEMREMAP_PMEM);
-                memcpy(&bb_res, &nsio->res, sizeof(bb_res));
+                if (devm_add_action_or_reset(dev, pmem_release_queue,
+                                        &pmem->pgmap))
+                        return -ENOMEM;
+                bb_range.start = res->start;
+                bb_range.end = res->end;
         }
 
         if (IS_ERR(addr))
@@ -446,14 +462,12 @@
         pmem->virt_addr = addr;
 
         blk_queue_write_cache(q, true, fua);
-        blk_queue_make_request(q, pmem_make_request);
         blk_queue_physical_block_size(q, PAGE_SIZE);
         blk_queue_logical_block_size(q, pmem_sector_size(ndns));
         blk_queue_max_hw_sectors(q, UINT_MAX);
         blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
         if (pmem->pfn_flags & PFN_MAP)
                 blk_queue_flag_set(QUEUE_FLAG_DAX, q);
-        q->queuedata = pmem;
 
         disk = alloc_disk_node(0, nid);
         if (!disk)
@@ -463,30 +477,32 @@
         disk->fops = &pmem_fops;
         disk->queue = q;
         disk->flags = GENHD_FL_EXT_DEVT;
-        disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
+        disk->private_data = pmem;
         nvdimm_namespace_disk_name(ndns, disk->disk_name);
         set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
                         / 512);
         if (devm_init_badblocks(dev, &pmem->bb))
                 return -ENOMEM;
-        nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
+        nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range);
         disk->bb = &pmem->bb;
 
-        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
-        if (!dax_dev) {
+        if (is_nvdimm_sync(nd_region))
+                flags = DAXDEV_F_SYNC;
+        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
+        if (IS_ERR(dax_dev)) {
                 put_disk(disk);
-                return -ENOMEM;
+                return PTR_ERR(dax_dev);
         }
         dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
         pmem->dax_dev = dax_dev;
         gendev = disk_to_dev(disk);
         gendev->groups = pmem_attribute_groups;
 
-        device_add_disk(dev, disk);
+        device_add_disk(dev, disk, NULL);
         if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
                 return -ENOMEM;
 
-        revalidate_disk(disk);
+        nvdimm_check_and_set_ro(disk);
 
         pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
                         "badblocks");
@@ -498,14 +514,12 @@
 
 static int nd_pmem_probe(struct device *dev)
 {
+        int ret;
         struct nd_namespace_common *ndns;
 
         ndns = nvdimm_namespace_common_probe(dev);
         if (IS_ERR(ndns))
                 return PTR_ERR(ndns);
-
-        if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
-                return -ENXIO;
 
         if (is_nd_btt(dev))
                 return nvdimm_namespace_attach_btt(ndns);
@@ -513,12 +527,40 @@
         if (is_nd_pfn(dev))
                 return pmem_attach_disk(dev, ndns);
 
-        /* if we find a valid info-block we'll come back as that personality */
-        if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
-                        || nd_dax_probe(dev, ndns) == 0)
+        ret = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
+        if (ret)
+                return ret;
+
+        ret = nd_btt_probe(dev, ndns);
+        if (ret == 0)
                 return -ENXIO;
 
-        /* ...otherwise we're just a raw pmem device */
+        /*
+         * We have two failure conditions here, there is no
+         * info reserver block or we found a valid info reserve block
+         * but failed to initialize the pfn superblock.
+         *
+         * For the first case consider namespace as a raw pmem namespace
+         * and attach a disk.
+         *
+         * For the latter, consider this a success and advance the namespace
+         * seed.
+         */
+        ret = nd_pfn_probe(dev, ndns);
+        if (ret == 0)
+                return -ENXIO;
+        else if (ret == -EOPNOTSUPP)
+                return ret;
+
+        ret = nd_dax_probe(dev, ndns);
+        if (ret == 0)
+                return -ENXIO;
+        else if (ret == -EOPNOTSUPP)
+                return ret;
+
+        /* probe complete, attach handles namespace enabling */
+        devm_namespace_disable(dev, ndns);
+
         return pmem_attach_disk(dev, ndns);
 }
 
@@ -530,8 +572,8 @@
                 nvdimm_namespace_detach_btt(to_nd_btt(dev));
         else {
                 /*
-                 * Note, this assumes device_lock() context to not race
-                 * nd_pmem_notify()
+                 * Note, this assumes nd_device_lock() context to not
+                 * race nd_pmem_notify()
                  */
                 sysfs_put(pmem->bb_state);
                 pmem->bb_state = NULL;
@@ -552,8 +594,8 @@
         resource_size_t offset = 0, end_trunc = 0;
         struct nd_namespace_common *ndns;
         struct nd_namespace_io *nsio;
-        struct resource res;
         struct badblocks *bb;
+        struct range range;
         struct kernfs_node *bb_state;
 
         if (event != NVDIMM_REVALIDATE_POISON)
@@ -589,9 +631,9 @@
                 nsio = to_nd_namespace_io(&ndns->dev);
         }
 
-        res.start = nsio->res.start + offset;
-        res.end = nsio->res.end - end_trunc;
-        nvdimm_badblocks_populate(nd_region, bb, &res);
+        range.start = nsio->res.start + offset;
+        range.end = nsio->res.end - end_trunc;
+        nvdimm_badblocks_populate(nd_region, bb, &range);
         if (bb_state)
                 sysfs_notify_dirent(bb_state);
 }