~hc/RK356X_SDK_RELEASE.git

..	..	@@ -11,21 +11,12 @@
11	11	#include "xfs_trans_resv.h"
12	12	#include "xfs_mount.h"
13	13	#include "xfs_defer.h"
14		-#include "xfs_da_format.h"
15		-#include "xfs_da_btree.h"
16	14	#include "xfs_inode.h"
17	15	#include "xfs_trans.h"
18		-#include "xfs_inode_item.h"
19	16	#include "xfs_bmap.h"
20	17	#include "xfs_bmap_util.h"
21		-#include "xfs_error.h"
22		-#include "xfs_dir2.h"
23		-#include "xfs_dir2_priv.h"
24		-#include "xfs_ioctl.h"
25	18	#include "xfs_trace.h"
26		-#include "xfs_log.h"
27	19	#include "xfs_icache.h"
28		-#include "xfs_pnfs.h"
29	20	#include "xfs_btree.h"
30	21	#include "xfs_refcount_btree.h"
31	22	#include "xfs_refcount.h"
..	..	@@ -33,11 +24,9 @@
33	24	#include "xfs_trans_space.h"
34	25	#include "xfs_bit.h"
35	26	#include "xfs_alloc.h"
36		-#include "xfs_quota_defs.h"
37	27	#include "xfs_quota.h"
38	28	#include "xfs_reflink.h"
39	29	#include "xfs_iomap.h"
40		-#include "xfs_rmap_btree.h"
41	30	#include "xfs_sb.h"
42	31	#include "xfs_ag_resv.h"
43	32
..	..	@@ -154,8 +143,6 @@
154	143	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
155	144	if (error)
156	145	return error;
157		- if (!agbp)
158		- return -ENOMEM;
159	146
160	147	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
161	148
..	..	@@ -182,8 +169,7 @@
182	169	xfs_reflink_trim_around_shared(
183	170	struct xfs_inode *ip,
184	171	struct xfs_bmbt_irec *irec,
185		- bool *shared,
186		- bool *trimmed)
	172	+ bool *shared)
187	173	{
188	174	xfs_agnumber_t agno;
189	175	xfs_agblock_t agbno;
..	..	@@ -193,7 +179,7 @@
193	179	int error = 0;
194	180
195	181	/* Holes, unwritten, and delalloc extents cannot be shared */
196		- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
	182	+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
197	183	*shared = false;
198	184	return 0;
199	185	}
..	..	@@ -209,7 +195,7 @@
209	195	if (error)
210	196	return error;
211	197
212		- *shared = *trimmed = false;
	198	+ *shared = false;
213	199	if (fbno == NULLAGBLOCK) {
214	200	/* No shared blocks at all. */
215	201	return 0;
..	..	@@ -222,8 +208,6 @@
222	208	*/
223	209	irec->br_blockcount = flen;
224	210	*shared = true;
225		- if (flen != aglen)
226		- *trimmed = true;
227	211	return 0;
228	212	} else {
229	213	/*
..	..	@@ -233,100 +217,63 @@
233	217	* start of the shared region.
234	218	*/
235	219	irec->br_blockcount = fbno - agbno;
236		- *trimmed = true;
237	220	return 0;
238	221	}
239	222	}
240	223
241		-/*
242		- * Trim the passed in imap to the next shared/unshared extent boundary, and
243		- * if imap->br_startoff points to a shared extent reserve space for it in the
244		- * COW fork. In this case *shared is set to true, else to false.
245		- *
246		- * Note that imap will always contain the block numbers for the existing blocks
247		- * in the data fork, as the upper layers need them for read-modify-write
248		- * operations.
249		- */
250	224	int
251		-xfs_reflink_reserve_cow(
	225	+xfs_bmap_trim_cow(
252	226	struct xfs_inode *ip,
253	227	struct xfs_bmbt_irec *imap,
254	228	bool *shared)
255	229	{
256		- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
257		- struct xfs_bmbt_irec got;
258		- int error = 0;
259		- bool eof = false, trimmed;
260		- struct xfs_iext_cursor icur;
261		-
262		- /*
263		- * Search the COW fork extent list first. This serves two purposes:
264		- * first this implement the speculative preallocation using cowextisze,
265		- * so that we also unshared block adjacent to shared blocks instead
266		- * of just the shared blocks themselves. Second the lookup in the
267		- * extent list is generally faster than going out to the shared extent
268		- * tree.
269		- */
270		-
271		- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
272		- eof = true;
273		- if (!eof && got.br_startoff <= imap->br_startoff) {
274		- trace_xfs_reflink_cow_found(ip, imap);
275		- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
276		-
	230	+ /* We can't update any real extents in always COW mode. */
	231	+ if (xfs_is_always_cow_inode(ip) &&
	232	+ !isnullstartblock(imap->br_startblock)) {
277	233	*shared = true;
278	234	return 0;
279	235	}
280	236
281	237	/* Trim the mapping to the nearest shared extent boundary. */
282		- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
283		- if (error)
284		- return error;
285		-
286		- /* Not shared? Just report the (potentially capped) extent. */
287		- if (!*shared)
288		- return 0;
289		-
290		- /*
291		- * Fork all the shared blocks from our write offset until the end of
292		- * the extent.
293		- */
294		- error = xfs_qm_dqattach_locked(ip, false);
295		- if (error)
296		- return error;
297		-
298		- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
299		- imap->br_blockcount, 0, &got, &icur, eof);
300		- if (error == -ENOSPC || error == -EDQUOT)
301		- trace_xfs_reflink_cow_enospc(ip, imap);
302		- if (error)
303		- return error;
304		-
305		- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
306		- trace_xfs_reflink_cow_alloc(ip, &got);
307		- return 0;
	238	+ return xfs_reflink_trim_around_shared(ip, imap, shared);
308	239	}
309	240
310		-/* Convert part of an unwritten CoW extent to a real one. */
311		-STATIC int
312		-xfs_reflink_convert_cow_extent(
313		- struct xfs_inode *ip,
314		- struct xfs_bmbt_irec *imap,
315		- xfs_fileoff_t offset_fsb,
316		- xfs_filblks_t count_fsb)
	241	+static int
	242	+xfs_reflink_convert_cow_locked(
	243	+ struct xfs_inode *ip,
	244	+ xfs_fileoff_t offset_fsb,
	245	+ xfs_filblks_t count_fsb)
317	246	{
318		- int nimaps = 1;
	247	+ struct xfs_iext_cursor icur;
	248	+ struct xfs_bmbt_irec got;
	249	+ struct xfs_btree_cur *dummy_cur = NULL;
	250	+ int dummy_logflags;
	251	+ int error = 0;
319	252
320		- if (imap->br_state == XFS_EXT_NORM)
	253	+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
321	254	return 0;
322	255
323		- xfs_trim_extent(imap, offset_fsb, count_fsb);
324		- trace_xfs_reflink_convert_cow(ip, imap);
325		- if (imap->br_blockcount == 0)
326		- return 0;
327		- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
328		- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
329		- &nimaps);
	256	+ do {
	257	+ if (got.br_startoff >= offset_fsb + count_fsb)
	258	+ break;
	259	+ if (got.br_state == XFS_EXT_NORM)
	260	+ continue;
	261	+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
	262	+ return -EIO;
	263	+
	264	+ xfs_trim_extent(&got, offset_fsb, count_fsb);
	265	+ if (!got.br_blockcount)
	266	+ continue;
	267	+
	268	+ got.br_state = XFS_EXT_NORM;
	269	+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
	270	+ XFS_COW_FORK, &icur, &dummy_cur, &got,
	271	+ &dummy_logflags);
	272	+ if (error)
	273	+ return error;
	274	+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
	275	+
	276	+ return error;
330	277	}
331	278
332	279	/* Convert all of the unwritten CoW extents in a file's range to real ones. */
..	..	@@ -340,15 +287,12 @@
340	287	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
341	288	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
342	289	xfs_filblks_t count_fsb = end_fsb - offset_fsb;
343		- struct xfs_bmbt_irec imap;
344		- int nimaps = 1, error = 0;
	290	+ int error;
345	291
346	292	ASSERT(count != 0);
347	293
348	294	xfs_ilock(ip, XFS_ILOCK_EXCL);
349		- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
350		- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
351		- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
	295	+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
352	296	xfs_iunlock(ip, XFS_ILOCK_EXCL);
353	297	return error;
354	298	}
..	..	@@ -362,14 +306,13 @@
362	306	xfs_find_trim_cow_extent(
363	307	struct xfs_inode *ip,
364	308	struct xfs_bmbt_irec *imap,
	309	+ struct xfs_bmbt_irec *cmap,
365	310	bool *shared,
366	311	bool *found)
367	312	{
368	313	xfs_fileoff_t offset_fsb = imap->br_startoff;
369	314	xfs_filblks_t count_fsb = imap->br_blockcount;
370	315	struct xfs_iext_cursor icur;
371		- struct xfs_bmbt_irec got;
372		- bool trimmed;
373	316
374	317	*found = false;
375	318
..	..	@@ -377,19 +320,22 @@
377	320	* If we don't find an overlapping extent, trim the range we need to
378	321	* allocate to fit the hole we found.
379	322	*/
380		- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
381		- got.br_startoff > offset_fsb)
382		- return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
	323	+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
	324	+ cmap->br_startoff = offset_fsb + count_fsb;
	325	+ if (cmap->br_startoff > offset_fsb) {
	326	+ xfs_trim_extent(imap, imap->br_startoff,
	327	+ cmap->br_startoff - imap->br_startoff);
	328	+ return xfs_bmap_trim_cow(ip, imap, shared);
	329	+ }
383	330
384	331	*shared = true;
385		- if (isnullstartblock(got.br_startblock)) {
386		- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
	332	+ if (isnullstartblock(cmap->br_startblock)) {
	333	+ xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
387	334	return 0;
388	335	}
389	336
390	337	/* real extent found - no need to allocate */
391		- xfs_trim_extent(&got, offset_fsb, count_fsb);
392		- *imap = got;
	338	+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
393	339	*found = true;
394	340	return 0;
395	341	}
..	..	@@ -399,8 +345,10 @@
399	345	xfs_reflink_allocate_cow(
400	346	struct xfs_inode *ip,
401	347	struct xfs_bmbt_irec *imap,
	348	+ struct xfs_bmbt_irec *cmap,
402	349	bool *shared,
403		- uint *lockmode)
	350	+ uint *lockmode,
	351	+ bool convert_now)
404	352	{
405	353	struct xfs_mount *mp = ip->i_mount;
406	354	xfs_fileoff_t offset_fsb = imap->br_startoff;
..	..	@@ -412,9 +360,12 @@
412	360	xfs_extlen_t resblks = 0;
413	361
414	362	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
415		- ASSERT(xfs_is_reflink_inode(ip));
	363	+ if (!ip->i_cowfp) {
	364	+ ASSERT(!xfs_is_reflink_inode(ip));
	365	+ xfs_ifork_init_cow(ip);
	366	+ }
416	367
417		- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	368	+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
418	369	if (error || !*shared)
419	370	return error;
420	371	if (found)
..	..	@@ -439,7 +390,7 @@
439	390	/*
440	391	* Check for an overlapping extent again now that we dropped the ilock.
441	392	*/
442		- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
	393	+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
443	394	if (error || !*shared)
444	395	goto out_trans_cancel;
445	396	if (found) {
..	..	@@ -457,8 +408,8 @@
457	408	/* Allocate the entire reservation as unwritten blocks. */
458	409	nimaps = 1;
459	410	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
460		- XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
461		- resblks, imap, &nimaps);
	411	+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
	412	+ &nimaps);
462	413	if (error)
463	414	goto out_unreserve;
464	415
..	..	@@ -474,7 +425,16 @@
474	425	if (nimaps == 0)
475	426	return -ENOSPC;
476	427	convert:
477		- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
	428	+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
	429	+ /*
	430	+ * COW fork extents are supposed to remain unwritten until we're ready
	431	+ * to initiate a disk write. For direct I/O we are going to write the
	432	+ * data and need the conversion, but for buffered writes we're done.
	433	+ */
	434	+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
	435	+ return 0;
	436	+ trace_xfs_reflink_convert_cow(ip, cmap);
	437	+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
478	438
479	439	out_unreserve:
480	440	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
..	..	@@ -533,10 +493,8 @@
533	493	ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
534	494
535	495	/* Free the CoW orphan record. */
536		- error = xfs_refcount_free_cow_extent(*tpp,
537		- del.br_startblock, del.br_blockcount);
538		- if (error)
539		- break;
	496	+ xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
	497	+ del.br_blockcount);
540	498
541	499	xfs_bmap_add_free(*tpp, del.br_startblock,
542	500	del.br_blockcount, NULL);
..	..	@@ -589,7 +547,7 @@
589	547	int error;
590	548
591	549	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
592		- ASSERT(xfs_is_reflink_inode(ip));
	550	+ ASSERT(ip->i_cowfp);
593	551
594	552	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
595	553	if (count == NULLFILEOFF)
..	..	@@ -599,7 +557,7 @@
599	557
600	558	/* Start a rolling transaction to remove the mappings */
601	559	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
602		- 0, 0, XFS_TRANS_NOFS, &tp);
	560	+ 0, 0, 0, &tp);
603	561	if (error)
604	562	goto out;
605	563
..	..	@@ -626,54 +584,47 @@
626	584	}
627	585
628	586	/*
629		- * Remap parts of a file's data fork after a successful CoW.
	587	+ * Remap part of the CoW fork into the data fork.
	588	+ *
	589	+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
	590	+ * into the data fork; this function will remap what it can (at the end of the
	591	+ * range) and update @end_fsb appropriately. Each remap gets its own
	592	+ * transaction because we can end up merging and splitting bmbt blocks for
	593	+ * every remap operation and we'd like to keep the block reservation
	594	+ * requirements as low as possible.
630	595	*/
631		-int
632		-xfs_reflink_end_cow(
633		- struct xfs_inode *ip,
634		- xfs_off_t offset,
635		- xfs_off_t count)
	596	+STATIC int
	597	+xfs_reflink_end_cow_extent(
	598	+ struct xfs_inode *ip,
	599	+ xfs_fileoff_t offset_fsb,
	600	+ xfs_fileoff_t *end_fsb)
636	601	{
637		- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
638		- struct xfs_bmbt_irec got, del;
639		- struct xfs_trans *tp;
640		- xfs_fileoff_t offset_fsb;
641		- xfs_fileoff_t end_fsb;
642		- int error;
643		- unsigned int resblks;
644		- xfs_filblks_t rlen;
645		- struct xfs_iext_cursor icur;
646		-
647		- trace_xfs_reflink_end_cow(ip, offset, count);
	602	+ struct xfs_bmbt_irec got, del;
	603	+ struct xfs_iext_cursor icur;
	604	+ struct xfs_mount *mp = ip->i_mount;
	605	+ struct xfs_trans *tp;
	606	+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	607	+ xfs_filblks_t rlen;
	608	+ unsigned int resblks;
	609	+ int error;
648	610
649	611	/* No COW extents? That's easy! */
650		- if (ifp->if_bytes == 0)
	612	+ if (ifp->if_bytes == 0) {
	613	+ *end_fsb = offset_fsb;
651	614	return 0;
	615	+ }
652	616
653		- offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
654		- end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
	617	+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	618	+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
	619	+ XFS_TRANS_RESERVE, &tp);
	620	+ if (error)
	621	+ return error;
655	622
656	623	/*
657		- * Start a rolling transaction to switch the mappings. We're
658		- * unlikely ever to have to remap 16T worth of single-block
659		- * extents, so just cap the worst case extent count to 2^32-1.
660		- * Stick a warning in just in case, and avoid 64-bit division.
	624	+ * Lock the inode. We have to ijoin without automatic unlock because
	625	+ * the lead transaction is the refcountbt record deletion; the data
	626	+ * fork update follows as a deferred log item.
661	627	*/
662		- BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
663		- if (end_fsb - offset_fsb > UINT_MAX) {
664		- error = -EFSCORRUPTED;
665		- xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
666		- ASSERT(0);
667		- goto out;
668		- }
669		- resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
670		- (unsigned int)(end_fsb - offset_fsb),
671		- XFS_DATA_FORK);
672		- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
673		- resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
674		- if (error)
675		- goto out;
676		-
677	628	xfs_ilock(ip, XFS_ILOCK_EXCL);
678	629	xfs_trans_ijoin(tp, ip, 0);
679	630
..	..	@@ -682,80 +633,126 @@
682	633	* left by the time I/O completes for the loser of the race. In that
683	634	* case we are done.
684	635	*/
685		- if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
	636	+ if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
	637	+ got.br_startoff + got.br_blockcount <= offset_fsb) {
	638	+ *end_fsb = offset_fsb;
	639	+ goto out_cancel;
	640	+ }
	641	+
	642	+ /*
	643	+ * Structure copy @got into @del, then trim @del to the range that we
	644	+ * were asked to remap. We preserve @got for the eventual CoW fork
	645	+ * deletion; from now on @del represents the mapping that we're
	646	+ * actually remapping.
	647	+ */
	648	+ del = got;
	649	+ xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
	650	+
	651	+ ASSERT(del.br_blockcount > 0);
	652	+
	653	+ /*
	654	+ * Only remap real extents that contain data. With AIO, speculative
	655	+ * preallocations can leak into the range we are called upon, and we
	656	+ * need to skip them.
	657	+ */
	658	+ if (!xfs_bmap_is_written_extent(&got)) {
	659	+ *end_fsb = del.br_startoff;
	660	+ goto out_cancel;
	661	+ }
	662	+
	663	+ /* Unmap the old blocks in the data fork. */
	664	+ rlen = del.br_blockcount;
	665	+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
	666	+ if (error)
686	667	goto out_cancel;
687	668
688		- /* Walk backwards until we're out of the I/O range... */
689		- while (got.br_startoff + got.br_blockcount > offset_fsb) {
690		- del = got;
691		- xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
	669	+ /* Trim the extent to whatever got unmapped. */
	670	+ xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
	671	+ trace_xfs_reflink_cow_remap(ip, &del);
692	672
693		- /* Extent delete may have bumped ext forward */
694		- if (!del.br_blockcount)
695		- goto prev_extent;
	673	+ /* Free the CoW orphan record. */
	674	+ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
696	675
697		- /*
698		- * Only remap real extent that contain data. With AIO
699		- * speculatively preallocations can leak into the range we
700		- * are called upon, and we need to skip them.
701		- */
702		- if (!xfs_bmap_is_real_extent(&got))
703		- goto prev_extent;
	676	+ /* Map the new blocks into the data fork. */
	677	+ xfs_bmap_map_extent(tp, ip, &del);
704	678
705		- /* Unmap the old blocks in the data fork. */
706		- ASSERT(tp->t_firstblock == NULLFSBLOCK);
707		- rlen = del.br_blockcount;
708		- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
709		- if (error)
710		- goto out_cancel;
	679	+ /* Charge this new data fork mapping to the on-disk quota. */
	680	+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
	681	+ (long)del.br_blockcount);
711	682
712		- /* Trim the extent to whatever got unmapped. */
713		- if (rlen) {
714		- xfs_trim_extent(&del, del.br_startoff + rlen,
715		- del.br_blockcount - rlen);
716		- }
717		- trace_xfs_reflink_cow_remap(ip, &del);
718		-
719		- /* Free the CoW orphan record. */
720		- error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
721		- del.br_blockcount);
722		- if (error)
723		- goto out_cancel;
724		-
725		- /* Map the new blocks into the data fork. */
726		- error = xfs_bmap_map_extent(tp, ip, &del);
727		- if (error)
728		- goto out_cancel;
729		-
730		- /* Charge this new data fork mapping to the on-disk quota. */
731		- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
732		- (long)del.br_blockcount);
733		-
734		- /* Remove the mapping from the CoW fork. */
735		- xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
736		-
737		- error = xfs_defer_finish(&tp);
738		- if (error)
739		- goto out_cancel;
740		- if (!xfs_iext_get_extent(ifp, &icur, &got))
741		- break;
742		- continue;
743		-prev_extent:
744		- if (!xfs_iext_prev_extent(ifp, &icur, &got))
745		- break;
746		- }
	683	+ /* Remove the mapping from the CoW fork. */
	684	+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
747	685
748	686	error = xfs_trans_commit(tp);
749	687	xfs_iunlock(ip, XFS_ILOCK_EXCL);
750	688	if (error)
751		- goto out;
	689	+ return error;
	690	+
	691	+ /* Update the caller about how much progress we made. */
	692	+ *end_fsb = del.br_startoff;
752	693	return 0;
753	694
754	695	out_cancel:
755	696	xfs_trans_cancel(tp);
756	697	xfs_iunlock(ip, XFS_ILOCK_EXCL);
757		-out:
758		- trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
	698	+ return error;
	699	+}
	700	+
	701	+/*
	702	+ * Remap parts of a file's data fork after a successful CoW.
	703	+ */
	704	+int
	705	+xfs_reflink_end_cow(
	706	+ struct xfs_inode *ip,
	707	+ xfs_off_t offset,
	708	+ xfs_off_t count)
	709	+{
	710	+ xfs_fileoff_t offset_fsb;
	711	+ xfs_fileoff_t end_fsb;
	712	+ int error = 0;
	713	+
	714	+ trace_xfs_reflink_end_cow(ip, offset, count);
	715	+
	716	+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	717	+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
	718	+
	719	+ /*
	720	+ * Walk backwards until we're out of the I/O range. The loop function
	721	+ * repeatedly cycles the ILOCK to allocate one transaction per remapped
	722	+ * extent.
	723	+ *
	724	+ * If we're being called by writeback then the pages will still
	725	+ * have PageWriteback set, which prevents races with reflink remapping
	726	+ * and truncate. Reflink remapping prevents races with writeback by
	727	+ * taking the iolock and mmaplock before flushing the pages and
	728	+ * remapping, which means there won't be any further writeback or page
	729	+ * cache dirtying until the reflink completes.
	730	+ *
	731	+ * We should never have two threads issuing writeback for the same file
	732	+ * region. There are also post-eof checks in the writeback
	733	+ * preparation code so that we don't bother writing out pages that are
	734	+ * about to be truncated.
	735	+ *
	736	+ * If we're being called as part of directio write completion, the dio
	737	+ * count is still elevated, which reflink and truncate will wait for.
	738	+ * Reflink remapping takes the iolock and mmaplock and waits for
	739	+ * pending dio to finish, which should prevent any directio until the
	740	+ * remap completes. Multiple concurrent directio writes to the same
	741	+ * region are handled by end_cow processing only occurring for the
	742	+ * threads which succeed; the outcome of multiple overlapping direct
	743	+ * writes is not well defined anyway.
	744	+ *
	745	+ * It's possible that a buffered write and a direct write could collide
	746	+ * here (the buffered write stumbles in after the dio flushes and
	747	+ * invalidates the page cache and immediately queues writeback), but we
	748	+ * have never supported this 100%. If either disk write succeeds the
	749	+ * blocks will be remapped.
	750	+ */
	751	+ while (end_fsb > offset_fsb && !error)
	752	+ error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
	753	+
	754	+ if (error)
	755	+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
759	756	return error;
760	757	}
761	758
..	..	@@ -917,18 +914,18 @@
917	914	/*
918	915	* Update destination inode size & cowextsize hint, if necessary.
919	916	*/
920		-STATIC int
	917	+int
921	918	xfs_reflink_update_dest(
922	919	struct xfs_inode *dest,
923	920	xfs_off_t newlen,
924	921	xfs_extlen_t cowextsize,
925		- bool is_dedupe)
	922	+ unsigned int remap_flags)
926	923	{
927	924	struct xfs_mount *mp = dest->i_mount;
928	925	struct xfs_trans *tp;
929	926	int error;
930	927
931		- if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
	928	+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
932	929	return 0;
933	930
934	931	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
..	..	@@ -949,10 +946,6 @@
949	946	dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
950	947	}
951	948
952		- if (!is_dedupe) {
953		- xfs_trans_ichgtime(tp, dest,
954		- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
955		- }
956	949	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
957	950
958	951	error = xfs_trans_commit(tp);
..	..	@@ -991,41 +984,28 @@
991	984	}
992	985
993	986	/*
994		- * Unmap a range of blocks from a file, then map other blocks into the hole.
995		- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
996		- * The extent irec is mapped into dest at irec->br_startoff.
	987	+ * Remap the given extent into the file. The dmap blockcount will be set to
	988	+ * the number of blocks that were actually remapped.
997	989	*/
998	990	STATIC int
999	991	xfs_reflink_remap_extent(
1000	992	struct xfs_inode *ip,
1001		- struct xfs_bmbt_irec *irec,
1002		- xfs_fileoff_t destoff,
	993	+ struct xfs_bmbt_irec *dmap,
1003	994	xfs_off_t new_isize)
1004	995	{
	996	+ struct xfs_bmbt_irec smap;
1005	997	struct xfs_mount *mp = ip->i_mount;
1006		- bool real_extent = xfs_bmap_is_real_extent(irec);
1007	998	struct xfs_trans *tp;
1008		- unsigned int resblks;
1009		- struct xfs_bmbt_irec uirec;
1010		- xfs_filblks_t rlen;
1011		- xfs_filblks_t unmap_len;
1012	999	xfs_off_t newlen;
1013		- int64_t qres;
	1000	+ int64_t qres, qdelta;
	1001	+ unsigned int resblks;
	1002	+ bool smap_real;
	1003	+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
	1004	+ int nimaps;
1014	1005	int error;
1015	1006
1016		- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
1017		- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
1018		-
1019		- /* No reflinking if we're low on space */
1020		- if (real_extent) {
1021		- error = xfs_reflink_ag_has_free_space(mp,
1022		- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
1023		- if (error)
1024		- goto out;
1025		- }
1026		-
1027	1007	/* Start a rolling transaction to switch the mappings */
1028		- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
	1008	+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1029	1009	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1030	1010	if (error)
1031	1011	goto out;
..	..	@@ -1034,116 +1014,172 @@
1034	1014	xfs_trans_ijoin(tp, ip, 0);
1035	1015
1036	1016	/*
1037		- * Reserve quota for this operation. We don't know if the first unmap
1038		- * in the dest file will cause a bmap btree split, so we always reserve
1039		- * at least enough blocks for that split. If the extent being mapped
1040		- * in is written, we need to reserve quota for that too.
	1017	+ * Read what's currently mapped in the destination file into smap.
	1018	+ * If smap isn't a hole, we will have to remove it before we can add
	1019	+ * dmap to the destination file.
1041	1020	*/
1042		- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1043		- if (real_extent)
1044		- qres += irec->br_blockcount;
1045		- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
1046		- XFS_QMOPT_RES_REGBLKS);
	1021	+ nimaps = 1;
	1022	+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
	1023	+ &smap, &nimaps, 0);
1047	1024	if (error)
1048	1025	goto out_cancel;
	1026	+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
	1027	+ smap_real = xfs_bmap_is_real_extent(&smap);
1049	1028
1050		- trace_xfs_reflink_remap(ip, irec->br_startoff,
1051		- irec->br_blockcount, irec->br_startblock);
	1029	+ /*
	1030	+ * We can only remap as many blocks as the smaller of the two extent
	1031	+ * maps, because we can only remap one extent at a time.
	1032	+ */
	1033	+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
	1034	+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
1052	1035
1053		- /* Unmap the old blocks in the data fork. */
1054		- rlen = unmap_len;
1055		- while (rlen) {
1056		- ASSERT(tp->t_firstblock == NULLFSBLOCK);
1057		- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
1058		- if (error)
1059		- goto out_cancel;
	1036	+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
1060	1037
1061		- /*
1062		- * Trim the extent to whatever got unmapped.
1063		- * Remember, bunmapi works backwards.
1064		- */
1065		- uirec.br_startblock = irec->br_startblock + rlen;
1066		- uirec.br_startoff = irec->br_startoff + rlen;
1067		- uirec.br_blockcount = unmap_len - rlen;
1068		- uirec.br_state = irec->br_state;
1069		- unmap_len = rlen;
	1038	+ /*
	1039	+ * Two extents mapped to the same physical block must not have
	1040	+ * different states; that's filesystem corruption. Move on to the next
	1041	+ * extent if they're both holes or both the same physical extent.
	1042	+ */
	1043	+ if (dmap->br_startblock == smap.br_startblock) {
	1044	+ if (dmap->br_state != smap.br_state)
	1045	+ error = -EFSCORRUPTED;
	1046	+ goto out_cancel;
	1047	+ }
1070	1048
1071		- /* If this isn't a real mapping, we're done. */
1072		- if (!real_extent || uirec.br_blockcount == 0)
1073		- goto next_extent;
	1049	+ /* If both extents are unwritten, leave them alone. */
	1050	+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
	1051	+ smap.br_state == XFS_EXT_UNWRITTEN)
	1052	+ goto out_cancel;
1074	1053
1075		- trace_xfs_reflink_remap(ip, uirec.br_startoff,
1076		- uirec.br_blockcount, uirec.br_startblock);
1077		-
1078		- /* Update the refcount tree */
1079		- error = xfs_refcount_increase_extent(tp, &uirec);
1080		- if (error)
1081		- goto out_cancel;
1082		-
1083		- /* Map the new blocks into the data fork. */
1084		- error = xfs_bmap_map_extent(tp, ip, &uirec);
1085		- if (error)
1086		- goto out_cancel;
1087		-
1088		- /* Update quota accounting. */
1089		- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
1090		- uirec.br_blockcount);
1091		-
1092		- /* Update dest isize if needed. */
1093		- newlen = XFS_FSB_TO_B(mp,
1094		- uirec.br_startoff + uirec.br_blockcount);
1095		- newlen = min_t(xfs_off_t, newlen, new_isize);
1096		- if (newlen > i_size_read(VFS_I(ip))) {
1097		- trace_xfs_reflink_update_inode_size(ip, newlen);
1098		- i_size_write(VFS_I(ip), newlen);
1099		- ip->i_d.di_size = newlen;
1100		- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1101		- }
1102		-
1103		-next_extent:
1104		- /* Process all the deferred stuff. */
1105		- error = xfs_defer_finish(&tp);
	1054	+ /* No reflinking if the AG of the dest mapping is low on space. */
	1055	+ if (dmap_written) {
	1056	+ error = xfs_reflink_ag_has_free_space(mp,
	1057	+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
1106	1058	if (error)
1107	1059	goto out_cancel;
1108	1060	}
1109	1061
	1062	+ /*
	1063	+ * Compute quota reservation if we think the quota block counter for
	1064	+ * this file could increase.
	1065	+ *
	1066	+ * Adding a written extent to the extent map can cause a bmbt split,
	1067	+ * and removing a mapped extent from the extent map can cause a bmbt split.
	1068	+ * The two operations cannot both cause a split since they operate on
	1069	+ * the same index in the bmap btree, so we only need a reservation for
	1070	+ * one bmbt split if either thing is happening.
	1071	+ *
	1072	+ * If we are mapping a written extent into the file, we need to have
	1073	+ * enough quota block count reservation to handle the blocks in that
	1074	+ * extent. We log only the delta to the quota block counts, so if the
	1075	+ * extent we're unmapping also has blocks allocated to it, we don't
	1076	+ * need a quota reservation for the extent itself.
	1077	+ *
	1078	+ * Note that if we're replacing a delalloc reservation with a written
	1079	+ * extent, we have to take the full quota reservation because removing
	1080	+ * the delalloc reservation gives the block count back to the quota
	1081	+ * count. This is suboptimal, but the VFS flushed the dest range
	1082	+ * before we started. That should have removed all the delalloc
	1083	+ * reservations, but we code defensively.
	1084	+ */
	1085	+ qres = qdelta = 0;
	1086	+ if (smap_real || dmap_written)
	1087	+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	1088	+ if (!smap_real && dmap_written)
	1089	+ qres += dmap->br_blockcount;
	1090	+ if (qres > 0) {
	1091	+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
	1092	+ XFS_QMOPT_RES_REGBLKS);
	1093	+ if (error)
	1094	+ goto out_cancel;
	1095	+ }
	1096	+
	1097	+ if (smap_real) {
	1098	+ /*
	1099	+ * If the extent we're unmapping is backed by storage (written
	1100	+ * or not), unmap the extent and drop its refcount.
	1101	+ */
	1102	+ xfs_bmap_unmap_extent(tp, ip, &smap);
	1103	+ xfs_refcount_decrease_extent(tp, &smap);
	1104	+ qdelta -= smap.br_blockcount;
	1105	+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
	1106	+ xfs_filblks_t len = smap.br_blockcount;
	1107	+
	1108	+ /*
	1109	+ * If the extent we're unmapping is a delalloc reservation,
	1110	+ * we can use the regular bunmapi function to release the
	1111	+ * incore state. Dropping the delalloc reservation takes care
	1112	+ * of the quota reservation for us.
	1113	+ */
	1114	+ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
	1115	+ if (error)
	1116	+ goto out_cancel;
	1117	+ ASSERT(len == 0);
	1118	+ }
	1119	+
	1120	+ /*
	1121	+ * If the extent we're sharing is backed by written storage, increase
	1122	+ * its refcount and map it into the file.
	1123	+ */
	1124	+ if (dmap_written) {
	1125	+ xfs_refcount_increase_extent(tp, dmap);
	1126	+ xfs_bmap_map_extent(tp, ip, dmap);
	1127	+ qdelta += dmap->br_blockcount;
	1128	+ }
	1129	+
	1130	+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
	1131	+
	1132	+ /* Update dest isize if needed. */
	1133	+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
	1134	+ newlen = min_t(xfs_off_t, newlen, new_isize);
	1135	+ if (newlen > i_size_read(VFS_I(ip))) {
	1136	+ trace_xfs_reflink_update_inode_size(ip, newlen);
	1137	+ i_size_write(VFS_I(ip), newlen);
	1138	+ ip->i_d.di_size = newlen;
	1139	+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	1140	+ }
	1141	+
	1142	+ /* Commit everything and unlock. */
1110	1143	error = xfs_trans_commit(tp);
1111		- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1112		- if (error)
1113		- goto out;
1114		- return 0;
	1144	+ goto out_unlock;
1115	1145
1116	1146	out_cancel:
1117	1147	xfs_trans_cancel(tp);
	1148	+out_unlock:
1118	1149	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1119	1150	out:
1120		- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
	1151	+ if (error)
	1152	+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1121	1153	return error;
1122	1154	}
1123	1155
1124		-/*
1125		- * Iteratively remap one file's extents (and holes) to another's.
1126		- */
1127		-STATIC int
	1156	+/* Remap a range of one file to the other. */
	1157	+int
1128	1158	xfs_reflink_remap_blocks(
1129	1159	struct xfs_inode *src,
1130		- xfs_fileoff_t srcoff,
	1160	+ loff_t pos_in,
1131	1161	struct xfs_inode *dest,
1132		- xfs_fileoff_t destoff,
1133		- xfs_filblks_t len,
1134		- xfs_off_t new_isize)
	1162	+ loff_t pos_out,
	1163	+ loff_t remap_len,
	1164	+ loff_t *remapped)
1135	1165	{
1136	1166	struct xfs_bmbt_irec imap;
	1167	+ struct xfs_mount *mp = src->i_mount;
	1168	+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
	1169	+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
	1170	+ xfs_filblks_t len;
	1171	+ xfs_filblks_t remapped_len = 0;
	1172	+ xfs_off_t new_isize = pos_out + remap_len;
1137	1173	int nimaps;
1138	1174	int error = 0;
1139		- xfs_filblks_t range_len;
1140	1175
1141		- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
1142		- while (len) {
1143		- uint lock_mode;
	1176	+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
	1177	+ XFS_MAX_FILEOFF);
1144	1178
1145		- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
1146		- dest, destoff);
	1179	+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
	1180	+
	1181	+ while (len > 0) {
	1182	+ unsigned int lock_mode;
1147	1183
1148	1184	/* Read extent from the source file */
1149	1185	nimaps = 1;
..	..	@@ -1151,102 +1187,46 @@
1151	1187	error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
1152	1188	xfs_iunlock(src, lock_mode);
1153	1189	if (error)
1154		- goto err;
1155		- ASSERT(nimaps == 1);
	1190	+ break;
	1191	+ /*
	1192	+ * The caller supposedly flushed all dirty pages in the source
	1193	+ * file range, which means that writeback should have allocated
	1194	+ * or deleted all delalloc reservations in that range. If we
	1195	+ * find one, that's a good sign that something is seriously
	1196	+ * wrong here.
	1197	+ */
	1198	+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
	1199	+ if (imap.br_startblock == DELAYSTARTBLOCK) {
	1200	+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
	1201	+ error = -EFSCORRUPTED;
	1202	+ break;
	1203	+ }
1156	1204
1157		- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
1158		- &imap);
	1205	+ trace_xfs_reflink_remap_extent_src(src, &imap);
1159	1206
1160		- /* Translate imap into the destination file. */
1161		- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
1162		- imap.br_startoff += destoff - srcoff;
1163		-
1164		- /* Clear dest from destoff to the end of imap and map it in. */
1165		- error = xfs_reflink_remap_extent(dest, &imap, destoff,
1166		- new_isize);
	1207	+ /* Remap into the destination file at the given offset. */
	1208	+ imap.br_startoff = destoff;
	1209	+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
1167	1210	if (error)
1168		- goto err;
	1211	+ break;
1169	1212
1170	1213	if (fatal_signal_pending(current)) {
1171	1214	error = -EINTR;
1172		- goto err;
	1215	+ break;
1173	1216	}
1174	1217
1175	1218	/* Advance drange/srange */
1176		- srcoff += range_len;
1177		- destoff += range_len;
1178		- len -= range_len;
	1219	+ srcoff += imap.br_blockcount;
	1220	+ destoff += imap.br_blockcount;
	1221	+ len -= imap.br_blockcount;
	1222	+ remapped_len += imap.br_blockcount;
1179	1223	}
1180	1224
1181		- return 0;
1182		-
1183		-err:
1184		- trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	1225	+ if (error)
	1226	+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	1227	+ *remapped = min_t(loff_t, remap_len,
	1228	+ XFS_FSB_TO_B(src->i_mount, remapped_len));
1185	1229	return error;
1186		-}
1187		-
1188		-/*
1189		- * Grab the exclusive iolock for a data copy from src to dest, making
1190		- * sure to abide vfs locking order (lowest pointer value goes first) and
1191		- * breaking the pnfs layout leases on dest before proceeding. The loop
1192		- * is needed because we cannot call the blocking break_layout() with the
1193		- * src iolock held, and therefore have to back out both locks.
1194		- */
1195		-static int
1196		-xfs_iolock_two_inodes_and_break_layout(
1197		- struct inode *src,
1198		- struct inode *dest)
1199		-{
1200		- int error;
1201		-
1202		-retry:
1203		- if (src < dest) {
1204		- inode_lock_shared(src);
1205		- inode_lock_nested(dest, I_MUTEX_NONDIR2);
1206		- } else {
1207		- /* src >= dest */
1208		- inode_lock(dest);
1209		- }
1210		-
1211		- error = break_layout(dest, false);
1212		- if (error == -EWOULDBLOCK) {
1213		- inode_unlock(dest);
1214		- if (src < dest)
1215		- inode_unlock_shared(src);
1216		- error = break_layout(dest, true);
1217		- if (error)
1218		- return error;
1219		- goto retry;
1220		- }
1221		- if (error) {
1222		- inode_unlock(dest);
1223		- if (src < dest)
1224		- inode_unlock_shared(src);
1225		- return error;
1226		- }
1227		- if (src > dest)
1228		- inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
1229		- return 0;
1230		-}
1231		-
1232		-/* Unlock both inodes after they've been prepped for a range clone. */
1233		-STATIC void
1234		-xfs_reflink_remap_unlock(
1235		- struct file *file_in,
1236		- struct file *file_out)
1237		-{
1238		- struct inode *inode_in = file_inode(file_in);
1239		- struct xfs_inode *src = XFS_I(inode_in);
1240		- struct inode *inode_out = file_inode(file_out);
1241		- struct xfs_inode *dest = XFS_I(inode_out);
1242		- bool same_inode = (inode_in == inode_out);
1243		-
1244		- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
1245		- if (!same_inode)
1246		- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
1247		- inode_unlock(inode_out);
1248		- if (!same_inode)
1249		- inode_unlock_shared(inode_in);
1250	1230	}
1251	1231
1252	1232	/*
..	..	@@ -1266,7 +1246,7 @@
1266	1246
1267	1247	trace_xfs_zero_eof(ip, isize, pos - isize);
1268	1248	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
1269		- &xfs_iomap_ops);
	1249	+ &xfs_buffered_write_iomap_ops);
1270	1250	}
1271	1251
1272	1252	/*
..	..	@@ -1298,32 +1278,25 @@
1298	1278	* stale data in the destination file. Hence we reject these clone attempts with
1299	1279	* -EINVAL in this case.
1300	1280	*/
1301		-STATIC int
	1281	+int
1302	1282	xfs_reflink_remap_prep(
1303	1283	struct file *file_in,
1304	1284	loff_t pos_in,
1305	1285	struct file *file_out,
1306	1286	loff_t pos_out,
1307		- u64 *len,
1308		- bool is_dedupe)
	1287	+ loff_t *len,
	1288	+ unsigned int remap_flags)
1309	1289	{
1310	1290	struct inode *inode_in = file_inode(file_in);
1311	1291	struct xfs_inode *src = XFS_I(inode_in);
1312	1292	struct inode *inode_out = file_inode(file_out);
1313	1293	struct xfs_inode *dest = XFS_I(inode_out);
1314		- bool same_inode = (inode_in == inode_out);
1315		- u64 blkmask = i_blocksize(inode_in) - 1;
1316		- ssize_t ret;
	1294	+ int ret;
1317	1295
1318	1296	/* Lock both files against IO */
1319		- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
	1297	+ ret = xfs_ilock2_io_mmap(src, dest);
1320	1298	if (ret)
1321	1299	return ret;
1322		- if (same_inode)
1323		- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
1324		- else
1325		- xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
1326		- XFS_MMAPLOCK_EXCL);
1327	1300
1328	1301	/* Check file eligibility and prepare for block sharing. */
1329	1302	ret = -EINVAL;
..	..	@@ -1335,28 +1308,10 @@
1335	1308	if (IS_DAX(inode_in) \|\| IS_DAX(inode_out))
1336	1309	goto out_unlock;
1337	1310
1338		- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
1339		- len, is_dedupe);
1340		- if (ret <= 0)
	1311	+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
	1312	+ len, remap_flags);
	1313	+ if (ret \|\| *len == 0)
1341	1314	goto out_unlock;
1342		-
1343		- /*
1344		- * If the dedupe data matches, chop off the partial EOF block
1345		- * from the source file so we don't try to dedupe the partial
1346		- * EOF block.
1347		- */
1348		- if (is_dedupe) {
1349		- *len &= ~blkmask;
1350		- } else if (*len & blkmask) {
1351		- /*
1352		- * The user is attempting to share a partial EOF block,
1353		- * if it's inside the destination EOF then reject it.
1354		- */
1355		- if (pos_out + *len < i_size_read(inode_out)) {
1356		- ret = -EINVAL;
1357		- goto out_unlock;
1358		- }
1359		- }
1360	1315
1361	1316	/* Attach dquots to dest inode before changing block map */
1362	1317	ret = xfs_qm_dqattach(dest);
..	..	@@ -1390,178 +1345,10 @@
1390	1345	if (ret)
1391	1346	goto out_unlock;
1392	1347
1393		- /* If we're altering the file contents... */
1394		- if (!is_dedupe) {
1395		- /*
1396		- * ...update the timestamps (which will grab the ilock again
1397		- * from xfs_fs_dirty_inode, so we have to call it before we
1398		- * take the ilock).
1399		- */
1400		- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
1401		- ret = file_update_time(file_out);
1402		- if (ret)
1403		- goto out_unlock;
1404		- }
1405		-
1406		- /*
1407		- * ...clear the security bits if the process is not being run
1408		- * by root. This keeps people from modifying setuid and setgid
1409		- * binaries.
1410		- */
1411		- ret = file_remove_privs(file_out);
1412		- if (ret)
1413		- goto out_unlock;
1414		- }
1415		-
1416		- return 1;
	1348	+ return 0;
1417	1349	out_unlock:
1418		- xfs_reflink_remap_unlock(file_in, file_out);
	1350	+ xfs_iunlock2_io_mmap(src, dest);
1419	1351	return ret;
1420		-}
1421		-
1422		-/*
1423		- * Link a range of blocks from one file to another.
1424		- */
1425		-int
1426		-xfs_reflink_remap_range(
1427		- struct file *file_in,
1428		- loff_t pos_in,
1429		- struct file *file_out,
1430		- loff_t pos_out,
1431		- u64 len,
1432		- bool is_dedupe)
1433		-{
1434		- struct inode *inode_in = file_inode(file_in);
1435		- struct xfs_inode *src = XFS_I(inode_in);
1436		- struct inode *inode_out = file_inode(file_out);
1437		- struct xfs_inode *dest = XFS_I(inode_out);
1438		- struct xfs_mount *mp = src->i_mount;
1439		- xfs_fileoff_t sfsbno, dfsbno;
1440		- xfs_filblks_t fsblen;
1441		- xfs_extlen_t cowextsize;
1442		- ssize_t ret;
1443		-
1444		- if (!xfs_sb_version_hasreflink(&mp->m_sb))
1445		- return -EOPNOTSUPP;
1446		-
1447		- if (XFS_FORCED_SHUTDOWN(mp))
1448		- return -EIO;
1449		-
1450		- /* Prepare and then clone file data. */
1451		- ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1452		- &len, is_dedupe);
1453		- if (ret <= 0)
1454		- return ret;
1455		-
1456		- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1457		-
1458		- dfsbno = XFS_B_TO_FSBT(mp, pos_out);
1459		- sfsbno = XFS_B_TO_FSBT(mp, pos_in);
1460		- fsblen = XFS_B_TO_FSB(mp, len);
1461		- ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
1462		- pos_out + len);
1463		- if (ret)
1464		- goto out_unlock;
1465		-
1466		- /*
1467		- * Carry the cowextsize hint from src to dest if we're sharing the
1468		- * entire source file to the entire destination file, the source file
1469		- * has a cowextsize hint, and the destination file does not.
1470		- */
1471		- cowextsize = 0;
1472		- if (pos_in == 0 && len == i_size_read(inode_in) &&
1473		- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1474		- pos_out == 0 && len >= i_size_read(inode_out) &&
1475		- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1476		- cowextsize = src->i_d.di_cowextsize;
1477		-
1478		- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1479		- is_dedupe);
1480		-
1481		-out_unlock:
1482		- xfs_reflink_remap_unlock(file_in, file_out);
1483		- if (ret)
1484		- trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1485		- return ret;
1486		-}
1487		-
1488		-/*
1489		- * The user wants to preemptively CoW all shared blocks in this file,
1490		- * which enables us to turn off the reflink flag. Iterate all
1491		- * extents which are not prealloc/delalloc to see which ranges are
1492		- * mentioned in the refcount tree, then read those blocks into the
1493		- * pagecache, dirty them, fsync them back out, and then we can update
1494		- * the inode flag. What happens if we run out of memory? :)
1495		- */
1496		-STATIC int
1497		-xfs_reflink_dirty_extents(
1498		- struct xfs_inode *ip,
1499		- xfs_fileoff_t fbno,
1500		- xfs_filblks_t end,
1501		- xfs_off_t isize)
1502		-{
1503		- struct xfs_mount *mp = ip->i_mount;
1504		- xfs_agnumber_t agno;
1505		- xfs_agblock_t agbno;
1506		- xfs_extlen_t aglen;
1507		- xfs_agblock_t rbno;
1508		- xfs_extlen_t rlen;
1509		- xfs_off_t fpos;
1510		- xfs_off_t flen;
1511		- struct xfs_bmbt_irec map[2];
1512		- int nmaps;
1513		- int error = 0;
1514		-
1515		- while (end - fbno > 0) {
1516		- nmaps = 1;
1517		- /*
1518		- * Look for extents in the file. Skip holes, delalloc, or
1519		- * unwritten extents; they can't be reflinked.
1520		- */
1521		- error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
1522		- if (error)
1523		- goto out;
1524		- if (nmaps == 0)
1525		- break;
1526		- if (!xfs_bmap_is_real_extent(&map[0]))
1527		- goto next;
1528		-
1529		- map[1] = map[0];
1530		- while (map[1].br_blockcount) {
1531		- agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
1532		- agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
1533		- aglen = map[1].br_blockcount;
1534		-
1535		- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
1536		- aglen, &rbno, &rlen, true);
1537		- if (error)
1538		- goto out;
1539		- if (rbno == NULLAGBLOCK)
1540		- break;
1541		-
1542		- /* Dirty the pages */
1543		- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1544		- fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
1545		- (rbno - agbno));
1546		- flen = XFS_FSB_TO_B(mp, rlen);
1547		- if (fpos + flen > isize)
1548		- flen = isize - fpos;
1549		- error = iomap_file_dirty(VFS_I(ip), fpos, flen,
1550		- &xfs_iomap_ops);
1551		- xfs_ilock(ip, XFS_ILOCK_EXCL);
1552		- if (error)
1553		- goto out;
1554		-
1555		- map[1].br_blockcount -= (rbno - agbno + rlen);
1556		- map[1].br_startoff += (rbno - agbno + rlen);
1557		- map[1].br_startblock += (rbno - agbno + rlen);
1558		- }
1559		-
1560		-next:
1561		- fbno = map[0].br_startoff + map[0].br_blockcount;
1562		- }
1563		-out:
1564		- return error;
1565	1352	}
1566	1353
1567	1354	/* Does this inode need the reflink flag? */
..	..	@@ -1640,7 +1427,8 @@
1640	1427	* We didn't find any shared blocks so turn off the reflink flag.
1641	1428	* First, get rid of any leftover CoW mappings.
1642	1429	*/
1643		- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
	1430	+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
	1431	+ true);
1644	1432	if (error)
1645	1433	return error;
1646	1434
..	..	@@ -1700,10 +1488,7 @@
1700	1488	xfs_off_t offset,
1701	1489	xfs_off_t len)
1702	1490	{
1703		- struct xfs_mount *mp = ip->i_mount;
1704		- xfs_fileoff_t fbno;
1705		- xfs_filblks_t end;
1706		- xfs_off_t isize;
	1491	+ struct inode *inode = VFS_I(ip);
1707	1492	int error;
1708	1493
1709	1494	if (!xfs_is_reflink_inode(ip))
..	..	@@ -1711,20 +1496,15 @@
1711	1496
1712	1497	trace_xfs_reflink_unshare(ip, offset, len);
1713	1498
1714		- inode_dio_wait(VFS_I(ip));
	1499	+ inode_dio_wait(inode);
1715	1500
1716		- /* Try to CoW the selected ranges */
1717		- xfs_ilock(ip, XFS_ILOCK_EXCL);
1718		- fbno = XFS_B_TO_FSBT(mp, offset);
1719		- isize = i_size_read(VFS_I(ip));
1720		- end = XFS_B_TO_FSB(mp, offset + len);
1721		- error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
	1501	+ error = iomap_file_unshare(inode, offset, len,
	1502	+ &xfs_buffered_write_iomap_ops);
1722	1503	if (error)
1723		- goto out_unlock;
1724		- xfs_iunlock(ip, XFS_ILOCK_EXCL);
	1504	+ goto out;
1725	1505
1726		- /* Wait for the IO to finish */
1727		- error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	1506	+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
	1507	+ offset + len - 1);
1728	1508	if (error)
1729	1509	goto out;
1730	1510
..	..	@@ -1732,11 +1512,8 @@
1732	1512	error = xfs_reflink_try_clear_inode_flag(ip);
1733	1513	if (error)
1734	1514	goto out;
1735		-
1736	1515	return 0;
1737	1516
1738		-out_unlock:
1739		- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1740	1517	out:
1741	1518	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
1742	1519	return error;