2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/nvdimm/pmem.c
@@ -1,21 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Persistent Memory Driver
  *
  * Copyright (c) 2014-2015, Intel Corporation.
  * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
  * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 
-#include <asm/cacheflush.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
@@ -33,10 +24,11 @@
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
 #include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
-#include "nd-core.h"
 
 static struct device *to_dev(struct pmem_device *pmem)
 {
@@ -133,7 +125,7 @@
         while (len) {
                 mem = kmap_atomic(page);
                 chunk = min_t(unsigned int, len, PAGE_SIZE - off);
-                rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+                rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk);
                 kunmap_atomic(mem);
                 if (rem)
                         return BLK_STS_IOERR;
@@ -145,9 +137,25 @@
         return BLK_STS_OK;
 }
 
-static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-                unsigned int len, unsigned int off, unsigned int op,
-                sector_t sector)
+static blk_status_t pmem_do_read(struct pmem_device *pmem,
+                struct page *page, unsigned int page_off,
+                sector_t sector, unsigned int len)
+{
+        blk_status_t rc;
+        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+        void *pmem_addr = pmem->virt_addr + pmem_off;
+
+        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+                return BLK_STS_IOERR;
+
+        rc = read_pmem(page, page_off, pmem_addr, len);
+        flush_dcache_page(page);
+        return rc;
+}
+
+static blk_status_t pmem_do_write(struct pmem_device *pmem,
+                struct page *page, unsigned int page_off,
+                sector_t sector, unsigned int len)
 {
         blk_status_t rc = BLK_STS_OK;
         bool bad_pmem = false;
@@ -157,40 +165,31 @@
         if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                 bad_pmem = true;
 
-        if (!op_is_write(op)) {
-                if (unlikely(bad_pmem))
-                        rc = BLK_STS_IOERR;
-                else {
-                        rc = read_pmem(page, off, pmem_addr, len);
-                        flush_dcache_page(page);
-                }
-        } else {
-                /*
-                 * Note that we write the data both before and after
-                 * clearing poison. The write before clear poison
-                 * handles situations where the latest written data is
-                 * preserved and the clear poison operation simply marks
-                 * the address range as valid without changing the data.
-                 * In this case application software can assume that an
-                 * interrupted write will either return the new good
-                 * data or an error.
-                 *
-                 * However, if pmem_clear_poison() leaves the data in an
-                 * indeterminate state we need to perform the write
-                 * after clear poison.
-                 */
-                flush_dcache_page(page);
-                write_pmem(pmem_addr, page, off, len);
-                if (unlikely(bad_pmem)) {
-                        rc = pmem_clear_poison(pmem, pmem_off, len);
-                        write_pmem(pmem_addr, page, off, len);
-                }
+        /*
+         * Note that we write the data both before and after
+         * clearing poison. The write before clear poison
+         * handles situations where the latest written data is
+         * preserved and the clear poison operation simply marks
+         * the address range as valid without changing the data.
+         * In this case application software can assume that an
+         * interrupted write will either return the new good
+         * data or an error.
+         *
+         * However, if pmem_clear_poison() leaves the data in an
+         * indeterminate state we need to perform the write
+         * after clear poison.
+         */
+        flush_dcache_page(page);
+        write_pmem(pmem_addr, page, page_off, len);
+        if (unlikely(bad_pmem)) {
+                rc = pmem_clear_poison(pmem, pmem_off, len);
+                write_pmem(pmem_addr, page, page_off, len);
         }
 
         return rc;
 }
 
-static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t pmem_submit_bio(struct bio *bio)
 {
         int ret = 0;
         blk_status_t rc = 0;
@@ -198,23 +197,29 @@
         unsigned long start;
         struct bio_vec bvec;
         struct bvec_iter iter;
-        struct pmem_device *pmem = q->queuedata;
+        struct pmem_device *pmem = bio->bi_disk->private_data;
         struct nd_region *nd_region = to_region(pmem);
 
         if (bio->bi_opf & REQ_PREFLUSH)
                 ret = nvdimm_flush(nd_region, bio);
 
-        do_acct = nd_iostat_start(bio, &start);
+        do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+        if (do_acct)
+                start = bio_start_io_acct(bio);
         bio_for_each_segment(bvec, bio, iter) {
-                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-                                bvec.bv_offset, bio_op(bio), iter.bi_sector);
+                if (op_is_write(bio_op(bio)))
+                        rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
+                                        iter.bi_sector, bvec.bv_len);
+                else
+                        rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
+                                        iter.bi_sector, bvec.bv_len);
                 if (rc) {
                         bio->bi_status = rc;
                         break;
                 }
         }
         if (do_acct)
-                nd_iostat_end(bio, start);
+                bio_end_io_acct(bio, start);
 
         if (bio->bi_opf & REQ_FUA)
                 ret = nvdimm_flush(nd_region, bio);
@@ -229,12 +234,13 @@
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
                 struct page *page, unsigned int op)
 {
-        struct pmem_device *pmem = bdev->bd_queue->queuedata;
+        struct pmem_device *pmem = bdev->bd_disk->private_data;
         blk_status_t rc;
 
-        rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-                        0, op, sector);
-
+        if (op_is_write(op))
+                rc = pmem_do_write(pmem, page, 0, sector, thp_size(page));
+        else
+                rc = pmem_do_read(pmem, page, 0, sector, thp_size(page));
         /*
          * The ->rw_page interface is subtle and tricky. The core
          * retries on any error, so we can only invoke page_endio() in
@@ -273,9 +279,19 @@
 
 static const struct block_device_operations pmem_fops = {
         .owner = THIS_MODULE,
+        .submit_bio = pmem_submit_bio,
         .rw_page = pmem_rw_page,
-        .revalidate_disk = nvdimm_revalidate_disk,
 };
+
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                size_t nr_pages)
+{
+        struct pmem_device *pmem = dax_get_private(dax_dev);
+
+        return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
+                                PFN_PHYS(pgoff) >> SECTOR_SHIFT,
+                                PAGE_SIZE));
+}
 
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
                 pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
@@ -287,7 +303,7 @@
 
         /*
          * Use the 'no check' versions of copy_from_iter_flushcache() and
-         * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
+         * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
          * checking, both file offset and device offset, is handled by
          * dax_iomap_actor()
          */
@@ -300,13 +316,15 @@
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
                 void *addr, size_t bytes, struct iov_iter *i)
 {
-        return _copy_to_iter_mcsafe(addr, bytes, i);
+        return _copy_mc_to_iter(addr, bytes, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {
         .direct_access = pmem_dax_direct_access,
+        .dax_supported = generic_fsdax_supported,
         .copy_from_iter = pmem_copy_from_iter,
         .copy_to_iter = pmem_copy_to_iter,
+        .zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
@@ -314,16 +332,24 @@
         NULL,
 };
 
-static void pmem_release_queue(void *q)
+static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
+        struct request_queue *q =
+                container_of(pgmap->ref, struct request_queue, q_usage_counter);
+
         blk_cleanup_queue(q);
 }
 
-static void pmem_freeze_queue(struct percpu_ref *ref)
+static void pmem_release_queue(void *pgmap)
 {
-        struct request_queue *q;
+        pmem_pagemap_cleanup(pgmap);
+}
 
-        q = container_of(ref, typeof(*q), q_usage_counter);
+static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
+{
+        struct request_queue *q =
+                container_of(pgmap->ref, struct request_queue, q_usage_counter);
+
         blk_freeze_queue_start(q);
 }
 
@@ -337,26 +363,10 @@
         put_disk(pmem->disk);
 }
 
-static void pmem_release_pgmap_ops(void *__pgmap)
-{
-        dev_pagemap_put_ops();
-}
-
-static void fsdax_pagefree(struct page *page, void *data)
-{
-        wake_up_var(&page->_refcount);
-}
-
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
-{
-        dev_pagemap_get_ops();
-        if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
-                return -ENOMEM;
-        pgmap->type = MEMORY_DEVICE_FS_DAX;
-        pgmap->page_free = fsdax_pagefree;
-
-        return 0;
-}
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+        .kill = pmem_pagemap_kill,
+        .cleanup = pmem_pagemap_cleanup,
+};
 
 static int pmem_attach_disk(struct device *dev,
                 struct nd_namespace_common *ndns)
@@ -365,7 +375,7 @@
         struct nd_region *nd_region = to_nd_region(dev->parent);
         int nid = dev_to_node(dev), fua;
         struct resource *res = &nsio->res;
-        struct resource bb_res;
+        struct range bb_range;
         struct nd_pfn *nd_pfn = NULL;
         struct dax_device *dax_dev;
         struct nd_pfn_sb *pfn_sb;
@@ -375,10 +385,15 @@
         struct gendisk *disk;
         void *addr;
         int rc;
+        unsigned long flags = 0UL;
 
         pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
         if (!pmem)
                 return -ENOMEM;
+
+        rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
+        if (rc)
+                return rc;
 
         /* while nsio_rw_bytes is active, parse a pfn info block if present */
         if (is_nd_pfn(dev)) {
@@ -389,7 +404,7 @@
         }
 
         /* we're attaching a block device, disable raw namespace access */
-        devm_nsio_disable(dev, nsio);
+        devm_namespace_disable(dev, ndns);
 
         dev_set_drvdata(dev, pmem);
         pmem->phys_addr = res->start;
@@ -406,39 +421,40 @@
                 return -EBUSY;
         }
 
-        q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
+        q = blk_alloc_queue(dev_to_node(dev));
         if (!q)
-                return -ENOMEM;
-
-        if (devm_add_action_or_reset(dev, pmem_release_queue, q))
                 return -ENOMEM;
 
         pmem->pfn_flags = PFN_DEV;
         pmem->pgmap.ref = &q->q_usage_counter;
-        pmem->pgmap.kill = pmem_freeze_queue;
         if (is_nd_pfn(dev)) {
-                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-                        return -ENOMEM;
+                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+                pmem->pgmap.ops = &fsdax_pagemap_ops;
                 addr = devm_memremap_pages(dev, &pmem->pgmap);
                 pfn_sb = nd_pfn->pfn_sb;
                 pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
                 pmem->pfn_pad = resource_size(res) -
-                        resource_size(&pmem->pgmap.res);
+                        range_len(&pmem->pgmap.range);
                 pmem->pfn_flags |= PFN_MAP;
-                memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-                bb_res.start += pmem->data_offset;
+                bb_range = pmem->pgmap.range;
+                bb_range.start += pmem->data_offset;
         } else if (pmem_should_map_pages(dev)) {
-                memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
-                pmem->pgmap.altmap_valid = false;
-                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-                        return -ENOMEM;
+                pmem->pgmap.range.start = res->start;
+                pmem->pgmap.range.end = res->end;
+                pmem->pgmap.nr_range = 1;
+                pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+                pmem->pgmap.ops = &fsdax_pagemap_ops;
                 addr = devm_memremap_pages(dev, &pmem->pgmap);
                 pmem->pfn_flags |= PFN_MAP;
-                memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
+                bb_range = pmem->pgmap.range;
         } else {
                 addr = devm_memremap(dev, pmem->phys_addr,
                                 pmem->size, ARCH_MEMREMAP_PMEM);
-                memcpy(&bb_res, &nsio->res, sizeof(bb_res));
+                if (devm_add_action_or_reset(dev, pmem_release_queue,
+                                        &pmem->pgmap))
+                        return -ENOMEM;
+                bb_range.start = res->start;
+                bb_range.end = res->end;
         }
 
         if (IS_ERR(addr))
@@ -446,14 +462,12 @@
         pmem->virt_addr = addr;
 
         blk_queue_write_cache(q, true, fua);
-        blk_queue_make_request(q, pmem_make_request);
         blk_queue_physical_block_size(q, PAGE_SIZE);
         blk_queue_logical_block_size(q, pmem_sector_size(ndns));
         blk_queue_max_hw_sectors(q, UINT_MAX);
         blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
         if (pmem->pfn_flags & PFN_MAP)
                 blk_queue_flag_set(QUEUE_FLAG_DAX, q);
-        q->queuedata = pmem;
 
         disk = alloc_disk_node(0, nid);
         if (!disk)
@@ -463,30 +477,32 @@
         disk->fops = &pmem_fops;
         disk->queue = q;
         disk->flags = GENHD_FL_EXT_DEVT;
-        disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
+        disk->private_data = pmem;
         nvdimm_namespace_disk_name(ndns, disk->disk_name);
         set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
                         / 512);
         if (devm_init_badblocks(dev, &pmem->bb))
                 return -ENOMEM;
-        nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
+        nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range);
         disk->bb = &pmem->bb;
 
-        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
-        if (!dax_dev) {
+        if (is_nvdimm_sync(nd_region))
+                flags = DAXDEV_F_SYNC;
+        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
+        if (IS_ERR(dax_dev)) {
                 put_disk(disk);
-                return -ENOMEM;
+                return PTR_ERR(dax_dev);
         }
         dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
         pmem->dax_dev = dax_dev;
         gendev = disk_to_dev(disk);
         gendev->groups = pmem_attribute_groups;
 
-        device_add_disk(dev, disk);
+        device_add_disk(dev, disk, NULL);
         if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
                 return -ENOMEM;
 
-        revalidate_disk(disk);
+        nvdimm_check_and_set_ro(disk);
 
         pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
                         "badblocks");
@@ -498,14 +514,12 @@
 
 static int nd_pmem_probe(struct device *dev)
 {
+        int ret;
         struct nd_namespace_common *ndns;
 
         ndns = nvdimm_namespace_common_probe(dev);
         if (IS_ERR(ndns))
                 return PTR_ERR(ndns);
-
-        if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
-                return -ENXIO;
 
         if (is_nd_btt(dev))
                 return nvdimm_namespace_attach_btt(ndns);
@@ -513,12 +527,40 @@
         if (is_nd_pfn(dev))
                 return pmem_attach_disk(dev, ndns);
 
-        /* if we find a valid info-block we'll come back as that personality */
-        if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
-                        || nd_dax_probe(dev, ndns) == 0)
+        ret = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
+        if (ret)
+                return ret;
+
+        ret = nd_btt_probe(dev, ndns);
+        if (ret == 0)
                 return -ENXIO;
 
-        /* ...otherwise we're just a raw pmem device */
+        /*
+         * We have two failure conditions here, there is no
+         * info reserver block or we found a valid info reserve block
+         * but failed to initialize the pfn superblock.
+         *
+         * For the first case consider namespace as a raw pmem namespace
+         * and attach a disk.
+         *
+         * For the latter, consider this a success and advance the namespace
+         * seed.
+         */
+        ret = nd_pfn_probe(dev, ndns);
+        if (ret == 0)
+                return -ENXIO;
+        else if (ret == -EOPNOTSUPP)
+                return ret;
+
+        ret = nd_dax_probe(dev, ndns);
+        if (ret == 0)
+                return -ENXIO;
+        else if (ret == -EOPNOTSUPP)
+                return ret;
+
+        /* probe complete, attach handles namespace enabling */
+        devm_namespace_disable(dev, ndns);
+
         return pmem_attach_disk(dev, ndns);
 }
 
@@ -530,8 +572,8 @@
                 nvdimm_namespace_detach_btt(to_nd_btt(dev));
         else {
                 /*
-                 * Note, this assumes device_lock() context to not race
-                 * nd_pmem_notify()
+                 * Note, this assumes nd_device_lock() context to not
+                 * race nd_pmem_notify()
                  */
                 sysfs_put(pmem->bb_state);
                 pmem->bb_state = NULL;
@@ -552,8 +594,8 @@
         resource_size_t offset = 0, end_trunc = 0;
         struct nd_namespace_common *ndns;
         struct nd_namespace_io *nsio;
-        struct resource res;
         struct badblocks *bb;
+        struct range range;
         struct kernfs_node *bb_state;
 
         if (event != NVDIMM_REVALIDATE_POISON)
@@ -589,9 +631,9 @@
                 nsio = to_nd_namespace_io(&ndns->dev);
         }
 
-        res.start = nsio->res.start + offset;
-        res.end = nsio->res.end - end_trunc;
-        nvdimm_badblocks_populate(nd_region, bb, &res);
+        range.start = nsio->res.start + offset;
+        range.end = nsio->res.end - end_trunc;
+        nvdimm_badblocks_populate(nd_region, bb, &range);
         if (bb_state)
                 sysfs_notify_dirent(bb_state);
 }