hc
2024-05-10 10ebd8556b7990499c896a550e3d416b444211e6
kernel/fs/xfs/xfs_reflink.c
....@@ -11,21 +11,12 @@
1111 #include "xfs_trans_resv.h"
1212 #include "xfs_mount.h"
1313 #include "xfs_defer.h"
14
-#include "xfs_da_format.h"
15
-#include "xfs_da_btree.h"
1614 #include "xfs_inode.h"
1715 #include "xfs_trans.h"
18
-#include "xfs_inode_item.h"
1916 #include "xfs_bmap.h"
2017 #include "xfs_bmap_util.h"
21
-#include "xfs_error.h"
22
-#include "xfs_dir2.h"
23
-#include "xfs_dir2_priv.h"
24
-#include "xfs_ioctl.h"
2518 #include "xfs_trace.h"
26
-#include "xfs_log.h"
2719 #include "xfs_icache.h"
28
-#include "xfs_pnfs.h"
2920 #include "xfs_btree.h"
3021 #include "xfs_refcount_btree.h"
3122 #include "xfs_refcount.h"
....@@ -33,11 +24,9 @@
3324 #include "xfs_trans_space.h"
3425 #include "xfs_bit.h"
3526 #include "xfs_alloc.h"
36
-#include "xfs_quota_defs.h"
3727 #include "xfs_quota.h"
3828 #include "xfs_reflink.h"
3929 #include "xfs_iomap.h"
40
-#include "xfs_rmap_btree.h"
4130 #include "xfs_sb.h"
4231 #include "xfs_ag_resv.h"
4332
....@@ -154,8 +143,6 @@
154143 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
155144 if (error)
156145 return error;
157
- if (!agbp)
158
- return -ENOMEM;
159146
160147 cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
161148
....@@ -182,8 +169,7 @@
182169 xfs_reflink_trim_around_shared(
183170 struct xfs_inode *ip,
184171 struct xfs_bmbt_irec *irec,
185
- bool *shared,
186
- bool *trimmed)
172
+ bool *shared)
187173 {
188174 xfs_agnumber_t agno;
189175 xfs_agblock_t agbno;
....@@ -193,7 +179,7 @@
193179 int error = 0;
194180
195181 /* Holes, unwritten, and delalloc extents cannot be shared */
196
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
182
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
197183 *shared = false;
198184 return 0;
199185 }
....@@ -209,7 +195,7 @@
209195 if (error)
210196 return error;
211197
212
- *shared = *trimmed = false;
198
+ *shared = false;
213199 if (fbno == NULLAGBLOCK) {
214200 /* No shared blocks at all. */
215201 return 0;
....@@ -222,8 +208,6 @@
222208 */
223209 irec->br_blockcount = flen;
224210 *shared = true;
225
- if (flen != aglen)
226
- *trimmed = true;
227211 return 0;
228212 } else {
229213 /*
....@@ -233,100 +217,63 @@
233217 * start of the shared region.
234218 */
235219 irec->br_blockcount = fbno - agbno;
236
- *trimmed = true;
237220 return 0;
238221 }
239222 }
240223
241
-/*
242
- * Trim the passed in imap to the next shared/unshared extent boundary, and
243
- * if imap->br_startoff points to a shared extent reserve space for it in the
244
- * COW fork. In this case *shared is set to true, else to false.
245
- *
246
- * Note that imap will always contain the block numbers for the existing blocks
247
- * in the data fork, as the upper layers need them for read-modify-write
248
- * operations.
249
- */
250224 int
251
-xfs_reflink_reserve_cow(
225
+xfs_bmap_trim_cow(
252226 struct xfs_inode *ip,
253227 struct xfs_bmbt_irec *imap,
254228 bool *shared)
255229 {
256
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
257
- struct xfs_bmbt_irec got;
258
- int error = 0;
259
- bool eof = false, trimmed;
260
- struct xfs_iext_cursor icur;
261
-
262
- /*
263
- * Search the COW fork extent list first. This serves two purposes:
264
- * first this implement the speculative preallocation using cowextisze,
265
- * so that we also unshared block adjacent to shared blocks instead
266
- * of just the shared blocks themselves. Second the lookup in the
267
- * extent list is generally faster than going out to the shared extent
268
- * tree.
269
- */
270
-
271
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
272
- eof = true;
273
- if (!eof && got.br_startoff <= imap->br_startoff) {
274
- trace_xfs_reflink_cow_found(ip, imap);
275
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
276
-
230
+ /* We can't update any real extents in always COW mode. */
231
+ if (xfs_is_always_cow_inode(ip) &&
232
+ !isnullstartblock(imap->br_startblock)) {
277233 *shared = true;
278234 return 0;
279235 }
280236
281237 /* Trim the mapping to the nearest shared extent boundary. */
282
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
283
- if (error)
284
- return error;
285
-
286
- /* Not shared? Just report the (potentially capped) extent. */
287
- if (!*shared)
288
- return 0;
289
-
290
- /*
291
- * Fork all the shared blocks from our write offset until the end of
292
- * the extent.
293
- */
294
- error = xfs_qm_dqattach_locked(ip, false);
295
- if (error)
296
- return error;
297
-
298
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
299
- imap->br_blockcount, 0, &got, &icur, eof);
300
- if (error == -ENOSPC || error == -EDQUOT)
301
- trace_xfs_reflink_cow_enospc(ip, imap);
302
- if (error)
303
- return error;
304
-
305
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
306
- trace_xfs_reflink_cow_alloc(ip, &got);
307
- return 0;
238
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
308239 }
309240
310
-/* Convert part of an unwritten CoW extent to a real one. */
311
-STATIC int
312
-xfs_reflink_convert_cow_extent(
313
- struct xfs_inode *ip,
314
- struct xfs_bmbt_irec *imap,
315
- xfs_fileoff_t offset_fsb,
316
- xfs_filblks_t count_fsb)
241
+static int
242
+xfs_reflink_convert_cow_locked(
243
+ struct xfs_inode *ip,
244
+ xfs_fileoff_t offset_fsb,
245
+ xfs_filblks_t count_fsb)
317246 {
318
- int nimaps = 1;
247
+ struct xfs_iext_cursor icur;
248
+ struct xfs_bmbt_irec got;
249
+ struct xfs_btree_cur *dummy_cur = NULL;
250
+ int dummy_logflags;
251
+ int error = 0;
319252
320
- if (imap->br_state == XFS_EXT_NORM)
253
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
321254 return 0;
322255
323
- xfs_trim_extent(imap, offset_fsb, count_fsb);
324
- trace_xfs_reflink_convert_cow(ip, imap);
325
- if (imap->br_blockcount == 0)
326
- return 0;
327
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
328
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
329
- &nimaps);
256
+ do {
257
+ if (got.br_startoff >= offset_fsb + count_fsb)
258
+ break;
259
+ if (got.br_state == XFS_EXT_NORM)
260
+ continue;
261
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
262
+ return -EIO;
263
+
264
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
265
+ if (!got.br_blockcount)
266
+ continue;
267
+
268
+ got.br_state = XFS_EXT_NORM;
269
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
270
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
271
+ &dummy_logflags);
272
+ if (error)
273
+ return error;
274
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
275
+
276
+ return error;
330277 }
331278
332279 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
....@@ -340,15 +287,12 @@
340287 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
341288 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
342289 xfs_filblks_t count_fsb = end_fsb - offset_fsb;
343
- struct xfs_bmbt_irec imap;
344
- int nimaps = 1, error = 0;
290
+ int error;
345291
346292 ASSERT(count != 0);
347293
348294 xfs_ilock(ip, XFS_ILOCK_EXCL);
349
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
350
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
351
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
295
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
352296 xfs_iunlock(ip, XFS_ILOCK_EXCL);
353297 return error;
354298 }
....@@ -362,14 +306,13 @@
362306 xfs_find_trim_cow_extent(
363307 struct xfs_inode *ip,
364308 struct xfs_bmbt_irec *imap,
309
+ struct xfs_bmbt_irec *cmap,
365310 bool *shared,
366311 bool *found)
367312 {
368313 xfs_fileoff_t offset_fsb = imap->br_startoff;
369314 xfs_filblks_t count_fsb = imap->br_blockcount;
370315 struct xfs_iext_cursor icur;
371
- struct xfs_bmbt_irec got;
372
- bool trimmed;
373316
374317 *found = false;
375318
....@@ -377,19 +320,22 @@
377320 * If we don't find an overlapping extent, trim the range we need to
378321 * allocate to fit the hole we found.
379322 */
380
- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
381
- got.br_startoff > offset_fsb)
382
- return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
323
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
324
+ cmap->br_startoff = offset_fsb + count_fsb;
325
+ if (cmap->br_startoff > offset_fsb) {
326
+ xfs_trim_extent(imap, imap->br_startoff,
327
+ cmap->br_startoff - imap->br_startoff);
328
+ return xfs_bmap_trim_cow(ip, imap, shared);
329
+ }
383330
384331 *shared = true;
385
- if (isnullstartblock(got.br_startblock)) {
386
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
332
+ if (isnullstartblock(cmap->br_startblock)) {
333
+ xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
387334 return 0;
388335 }
389336
390337 /* real extent found - no need to allocate */
391
- xfs_trim_extent(&got, offset_fsb, count_fsb);
392
- *imap = got;
338
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
393339 *found = true;
394340 return 0;
395341 }
....@@ -399,8 +345,10 @@
399345 xfs_reflink_allocate_cow(
400346 struct xfs_inode *ip,
401347 struct xfs_bmbt_irec *imap,
348
+ struct xfs_bmbt_irec *cmap,
402349 bool *shared,
403
- uint *lockmode)
350
+ uint *lockmode,
351
+ bool convert_now)
404352 {
405353 struct xfs_mount *mp = ip->i_mount;
406354 xfs_fileoff_t offset_fsb = imap->br_startoff;
....@@ -412,9 +360,12 @@
412360 xfs_extlen_t resblks = 0;
413361
414362 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
415
- ASSERT(xfs_is_reflink_inode(ip));
363
+ if (!ip->i_cowfp) {
364
+ ASSERT(!xfs_is_reflink_inode(ip));
365
+ xfs_ifork_init_cow(ip);
366
+ }
416367
417
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
368
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
418369 if (error || !*shared)
419370 return error;
420371 if (found)
....@@ -439,7 +390,7 @@
439390 /*
440391 * Check for an overlapping extent again now that we dropped the ilock.
441392 */
442
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
393
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
443394 if (error || !*shared)
444395 goto out_trans_cancel;
445396 if (found) {
....@@ -457,8 +408,8 @@
457408 /* Allocate the entire reservation as unwritten blocks. */
458409 nimaps = 1;
459410 error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
460
- XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
461
- resblks, imap, &nimaps);
411
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
412
+ &nimaps);
462413 if (error)
463414 goto out_unreserve;
464415
....@@ -474,7 +425,16 @@
474425 if (nimaps == 0)
475426 return -ENOSPC;
476427 convert:
477
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
428
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
429
+ /*
430
+ * COW fork extents are supposed to remain unwritten until we're ready
431
+ * to initiate a disk write. For direct I/O we are going to write the
432
+ * data and need the conversion, but for buffered writes we're done.
433
+ */
434
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
435
+ return 0;
436
+ trace_xfs_reflink_convert_cow(ip, cmap);
437
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
478438
479439 out_unreserve:
480440 xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
....@@ -533,10 +493,8 @@
533493 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
534494
535495 /* Free the CoW orphan record. */
536
- error = xfs_refcount_free_cow_extent(*tpp,
537
- del.br_startblock, del.br_blockcount);
538
- if (error)
539
- break;
496
+ xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
497
+ del.br_blockcount);
540498
541499 xfs_bmap_add_free(*tpp, del.br_startblock,
542500 del.br_blockcount, NULL);
....@@ -589,7 +547,7 @@
589547 int error;
590548
591549 trace_xfs_reflink_cancel_cow_range(ip, offset, count);
592
- ASSERT(xfs_is_reflink_inode(ip));
550
+ ASSERT(ip->i_cowfp);
593551
594552 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
595553 if (count == NULLFILEOFF)
....@@ -599,7 +557,7 @@
599557
600558 /* Start a rolling transaction to remove the mappings */
601559 error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
602
- 0, 0, XFS_TRANS_NOFS, &tp);
560
+ 0, 0, 0, &tp);
603561 if (error)
604562 goto out;
605563
....@@ -626,54 +584,47 @@
626584 }
627585
628586 /*
629
- * Remap parts of a file's data fork after a successful CoW.
587
+ * Remap part of the CoW fork into the data fork.
588
+ *
589
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
590
+ * into the data fork; this function will remap what it can (at the end of the
591
+ * range) and update @end_fsb appropriately. Each remap gets its own
592
+ * transaction because we can end up merging and splitting bmbt blocks for
593
+ * every remap operation and we'd like to keep the block reservation
594
+ * requirements as low as possible.
630595 */
631
-int
632
-xfs_reflink_end_cow(
633
- struct xfs_inode *ip,
634
- xfs_off_t offset,
635
- xfs_off_t count)
596
+STATIC int
597
+xfs_reflink_end_cow_extent(
598
+ struct xfs_inode *ip,
599
+ xfs_fileoff_t offset_fsb,
600
+ xfs_fileoff_t *end_fsb)
636601 {
637
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
638
- struct xfs_bmbt_irec got, del;
639
- struct xfs_trans *tp;
640
- xfs_fileoff_t offset_fsb;
641
- xfs_fileoff_t end_fsb;
642
- int error;
643
- unsigned int resblks;
644
- xfs_filblks_t rlen;
645
- struct xfs_iext_cursor icur;
646
-
647
- trace_xfs_reflink_end_cow(ip, offset, count);
602
+ struct xfs_bmbt_irec got, del;
603
+ struct xfs_iext_cursor icur;
604
+ struct xfs_mount *mp = ip->i_mount;
605
+ struct xfs_trans *tp;
606
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
607
+ xfs_filblks_t rlen;
608
+ unsigned int resblks;
609
+ int error;
648610
649611 /* No COW extents? That's easy! */
650
- if (ifp->if_bytes == 0)
612
+ if (ifp->if_bytes == 0) {
613
+ *end_fsb = offset_fsb;
651614 return 0;
615
+ }
652616
653
- offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
654
- end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
617
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
618
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
619
+ XFS_TRANS_RESERVE, &tp);
620
+ if (error)
621
+ return error;
655622
656623 /*
657
- * Start a rolling transaction to switch the mappings. We're
658
- * unlikely ever to have to remap 16T worth of single-block
659
- * extents, so just cap the worst case extent count to 2^32-1.
660
- * Stick a warning in just in case, and avoid 64-bit division.
624
+ * Lock the inode. We have to ijoin without automatic unlock because
625
+ * the lead transaction is the refcountbt record deletion; the data
626
+ * fork update follows as a deferred log item.
661627 */
662
- BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
663
- if (end_fsb - offset_fsb > UINT_MAX) {
664
- error = -EFSCORRUPTED;
665
- xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
666
- ASSERT(0);
667
- goto out;
668
- }
669
- resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
670
- (unsigned int)(end_fsb - offset_fsb),
671
- XFS_DATA_FORK);
672
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
673
- resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
674
- if (error)
675
- goto out;
676
-
677628 xfs_ilock(ip, XFS_ILOCK_EXCL);
678629 xfs_trans_ijoin(tp, ip, 0);
679630
....@@ -682,80 +633,126 @@
682633 * left by the time I/O completes for the loser of the race. In that
683634 * case we are done.
684635 */
685
- if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
636
+ if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
637
+ got.br_startoff + got.br_blockcount <= offset_fsb) {
638
+ *end_fsb = offset_fsb;
639
+ goto out_cancel;
640
+ }
641
+
642
+ /*
643
+ * Structure copy @got into @del, then trim @del to the range that we
644
+ * were asked to remap. We preserve @got for the eventual CoW fork
645
+ * deletion; from now on @del represents the mapping that we're
646
+ * actually remapping.
647
+ */
648
+ del = got;
649
+ xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
650
+
651
+ ASSERT(del.br_blockcount > 0);
652
+
653
+ /*
654
+ * Only remap real extents that contain data. With AIO, speculative
655
+ * preallocations can leak into the range we are called upon, and we
656
+ * need to skip them.
657
+ */
658
+ if (!xfs_bmap_is_written_extent(&got)) {
659
+ *end_fsb = del.br_startoff;
660
+ goto out_cancel;
661
+ }
662
+
663
+ /* Unmap the old blocks in the data fork. */
664
+ rlen = del.br_blockcount;
665
+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
666
+ if (error)
686667 goto out_cancel;
687668
688
- /* Walk backwards until we're out of the I/O range... */
689
- while (got.br_startoff + got.br_blockcount > offset_fsb) {
690
- del = got;
691
- xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
669
+ /* Trim the extent to whatever got unmapped. */
670
+ xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
671
+ trace_xfs_reflink_cow_remap(ip, &del);
692672
693
- /* Extent delete may have bumped ext forward */
694
- if (!del.br_blockcount)
695
- goto prev_extent;
673
+ /* Free the CoW orphan record. */
674
+ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
696675
697
- /*
698
- * Only remap real extent that contain data. With AIO
699
- * speculatively preallocations can leak into the range we
700
- * are called upon, and we need to skip them.
701
- */
702
- if (!xfs_bmap_is_real_extent(&got))
703
- goto prev_extent;
676
+ /* Map the new blocks into the data fork. */
677
+ xfs_bmap_map_extent(tp, ip, &del);
704678
705
- /* Unmap the old blocks in the data fork. */
706
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
707
- rlen = del.br_blockcount;
708
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
709
- if (error)
710
- goto out_cancel;
679
+ /* Charge this new data fork mapping to the on-disk quota. */
680
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
681
+ (long)del.br_blockcount);
711682
712
- /* Trim the extent to whatever got unmapped. */
713
- if (rlen) {
714
- xfs_trim_extent(&del, del.br_startoff + rlen,
715
- del.br_blockcount - rlen);
716
- }
717
- trace_xfs_reflink_cow_remap(ip, &del);
718
-
719
- /* Free the CoW orphan record. */
720
- error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
721
- del.br_blockcount);
722
- if (error)
723
- goto out_cancel;
724
-
725
- /* Map the new blocks into the data fork. */
726
- error = xfs_bmap_map_extent(tp, ip, &del);
727
- if (error)
728
- goto out_cancel;
729
-
730
- /* Charge this new data fork mapping to the on-disk quota. */
731
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
732
- (long)del.br_blockcount);
733
-
734
- /* Remove the mapping from the CoW fork. */
735
- xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
736
-
737
- error = xfs_defer_finish(&tp);
738
- if (error)
739
- goto out_cancel;
740
- if (!xfs_iext_get_extent(ifp, &icur, &got))
741
- break;
742
- continue;
743
-prev_extent:
744
- if (!xfs_iext_prev_extent(ifp, &icur, &got))
745
- break;
746
- }
683
+ /* Remove the mapping from the CoW fork. */
684
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
747685
748686 error = xfs_trans_commit(tp);
749687 xfs_iunlock(ip, XFS_ILOCK_EXCL);
750688 if (error)
751
- goto out;
689
+ return error;
690
+
691
+ /* Update the caller about how much progress we made. */
692
+ *end_fsb = del.br_startoff;
752693 return 0;
753694
754695 out_cancel:
755696 xfs_trans_cancel(tp);
756697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
757
-out:
758
- trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
698
+ return error;
699
+}
700
+
701
+/*
702
+ * Remap parts of a file's data fork after a successful CoW.
703
+ */
704
+int
705
+xfs_reflink_end_cow(
706
+ struct xfs_inode *ip,
707
+ xfs_off_t offset,
708
+ xfs_off_t count)
709
+{
710
+ xfs_fileoff_t offset_fsb;
711
+ xfs_fileoff_t end_fsb;
712
+ int error = 0;
713
+
714
+ trace_xfs_reflink_end_cow(ip, offset, count);
715
+
716
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
717
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
718
+
719
+ /*
720
+ * Walk backwards until we're out of the I/O range. The loop function
721
+ * repeatedly cycles the ILOCK to allocate one transaction per remapped
722
+ * extent.
723
+ *
724
+ * If we're being called by writeback then the pages will still
725
+ * have PageWriteback set, which prevents races with reflink remapping
726
+ * and truncate. Reflink remapping prevents races with writeback by
727
+ * taking the iolock and mmaplock before flushing the pages and
728
+ * remapping, which means there won't be any further writeback or page
729
+ * cache dirtying until the reflink completes.
730
+ *
731
+ * We should never have two threads issuing writeback for the same file
732
+ * region. There are also have post-eof checks in the writeback
733
+ * preparation code so that we don't bother writing out pages that are
734
+ * about to be truncated.
735
+ *
736
+ * If we're being called as part of directio write completion, the dio
737
+ * count is still elevated, which reflink and truncate will wait for.
738
+ * Reflink remapping takes the iolock and mmaplock and waits for
739
+ * pending dio to finish, which should prevent any directio until the
740
+ * remap completes. Multiple concurrent directio writes to the same
741
+ * region are handled by end_cow processing only occurring for the
742
+ * threads which succeed; the outcome of multiple overlapping direct
743
+ * writes is not well defined anyway.
744
+ *
745
+ * It's possible that a buffered write and a direct write could collide
746
+ * here (the buffered write stumbles in after the dio flushes and
747
+ * invalidates the page cache and immediately queues writeback), but we
748
+ * have never supported this 100%. If either disk write succeeds the
749
+ * blocks will be remapped.
750
+ */
751
+ while (end_fsb > offset_fsb && !error)
752
+ error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
753
+
754
+ if (error)
755
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
759756 return error;
760757 }
761758
....@@ -917,18 +914,18 @@
917914 /*
918915 * Update destination inode size & cowextsize hint, if necessary.
919916 */
920
-STATIC int
917
+int
921918 xfs_reflink_update_dest(
922919 struct xfs_inode *dest,
923920 xfs_off_t newlen,
924921 xfs_extlen_t cowextsize,
925
- bool is_dedupe)
922
+ unsigned int remap_flags)
926923 {
927924 struct xfs_mount *mp = dest->i_mount;
928925 struct xfs_trans *tp;
929926 int error;
930927
931
- if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
928
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
932929 return 0;
933930
934931 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
....@@ -949,10 +946,6 @@
949946 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
950947 }
951948
952
- if (!is_dedupe) {
953
- xfs_trans_ichgtime(tp, dest,
954
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
955
- }
956949 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
957950
958951 error = xfs_trans_commit(tp);
....@@ -991,41 +984,28 @@
991984 }
992985
993986 /*
994
- * Unmap a range of blocks from a file, then map other blocks into the hole.
995
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
996
- * The extent irec is mapped into dest at irec->br_startoff.
987
+ * Remap the given extent into the file. The dmap blockcount will be set to
988
+ * the number of blocks that were actually remapped.
997989 */
998990 STATIC int
999991 xfs_reflink_remap_extent(
1000992 struct xfs_inode *ip,
1001
- struct xfs_bmbt_irec *irec,
1002
- xfs_fileoff_t destoff,
993
+ struct xfs_bmbt_irec *dmap,
1003994 xfs_off_t new_isize)
1004995 {
996
+ struct xfs_bmbt_irec smap;
1005997 struct xfs_mount *mp = ip->i_mount;
1006
- bool real_extent = xfs_bmap_is_real_extent(irec);
1007998 struct xfs_trans *tp;
1008
- unsigned int resblks;
1009
- struct xfs_bmbt_irec uirec;
1010
- xfs_filblks_t rlen;
1011
- xfs_filblks_t unmap_len;
1012999 xfs_off_t newlen;
1013
- int64_t qres;
1000
+ int64_t qres, qdelta;
1001
+ unsigned int resblks;
1002
+ bool smap_real;
1003
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
1004
+ int nimaps;
10141005 int error;
10151006
1016
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
1017
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
1018
-
1019
- /* No reflinking if we're low on space */
1020
- if (real_extent) {
1021
- error = xfs_reflink_ag_has_free_space(mp,
1022
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
1023
- if (error)
1024
- goto out;
1025
- }
1026
-
10271007 /* Start a rolling transaction to switch the mappings */
1028
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
1008
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
10291009 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
10301010 if (error)
10311011 goto out;
....@@ -1034,116 +1014,172 @@
10341014 xfs_trans_ijoin(tp, ip, 0);
10351015
10361016 /*
1037
- * Reserve quota for this operation. We don't know if the first unmap
1038
- * in the dest file will cause a bmap btree split, so we always reserve
1039
- * at least enough blocks for that split. If the extent being mapped
1040
- * in is written, we need to reserve quota for that too.
1017
+ * Read what's currently mapped in the destination file into smap.
1018
+ * If smap isn't a hole, we will have to remove it before we can add
1019
+ * dmap to the destination file.
10411020 */
1042
- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1043
- if (real_extent)
1044
- qres += irec->br_blockcount;
1045
- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
1046
- XFS_QMOPT_RES_REGBLKS);
1021
+ nimaps = 1;
1022
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
1023
+ &smap, &nimaps, 0);
10471024 if (error)
10481025 goto out_cancel;
1026
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
1027
+ smap_real = xfs_bmap_is_real_extent(&smap);
10491028
1050
- trace_xfs_reflink_remap(ip, irec->br_startoff,
1051
- irec->br_blockcount, irec->br_startblock);
1029
+ /*
1030
+ * We can only remap as many blocks as the smaller of the two extent
1031
+ * maps, because we can only remap one extent at a time.
1032
+ */
1033
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
1034
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
10521035
1053
- /* Unmap the old blocks in the data fork. */
1054
- rlen = unmap_len;
1055
- while (rlen) {
1056
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
1057
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
1058
- if (error)
1059
- goto out_cancel;
1036
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
10601037
1061
- /*
1062
- * Trim the extent to whatever got unmapped.
1063
- * Remember, bunmapi works backwards.
1064
- */
1065
- uirec.br_startblock = irec->br_startblock + rlen;
1066
- uirec.br_startoff = irec->br_startoff + rlen;
1067
- uirec.br_blockcount = unmap_len - rlen;
1068
- uirec.br_state = irec->br_state;
1069
- unmap_len = rlen;
1038
+ /*
1039
+ * Two extents mapped to the same physical block must not have
1040
+ * different states; that's filesystem corruption. Move on to the next
1041
+ * extent if they're both holes or both the same physical extent.
1042
+ */
1043
+ if (dmap->br_startblock == smap.br_startblock) {
1044
+ if (dmap->br_state != smap.br_state)
1045
+ error = -EFSCORRUPTED;
1046
+ goto out_cancel;
1047
+ }
10701048
1071
- /* If this isn't a real mapping, we're done. */
1072
- if (!real_extent || uirec.br_blockcount == 0)
1073
- goto next_extent;
1049
+ /* If both extents are unwritten, leave them alone. */
1050
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
1051
+ smap.br_state == XFS_EXT_UNWRITTEN)
1052
+ goto out_cancel;
10741053
1075
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
1076
- uirec.br_blockcount, uirec.br_startblock);
1077
-
1078
- /* Update the refcount tree */
1079
- error = xfs_refcount_increase_extent(tp, &uirec);
1080
- if (error)
1081
- goto out_cancel;
1082
-
1083
- /* Map the new blocks into the data fork. */
1084
- error = xfs_bmap_map_extent(tp, ip, &uirec);
1085
- if (error)
1086
- goto out_cancel;
1087
-
1088
- /* Update quota accounting. */
1089
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
1090
- uirec.br_blockcount);
1091
-
1092
- /* Update dest isize if needed. */
1093
- newlen = XFS_FSB_TO_B(mp,
1094
- uirec.br_startoff + uirec.br_blockcount);
1095
- newlen = min_t(xfs_off_t, newlen, new_isize);
1096
- if (newlen > i_size_read(VFS_I(ip))) {
1097
- trace_xfs_reflink_update_inode_size(ip, newlen);
1098
- i_size_write(VFS_I(ip), newlen);
1099
- ip->i_d.di_size = newlen;
1100
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1101
- }
1102
-
1103
-next_extent:
1104
- /* Process all the deferred stuff. */
1105
- error = xfs_defer_finish(&tp);
1054
+ /* No reflinking if the AG of the dest mapping is low on space. */
1055
+ if (dmap_written) {
1056
+ error = xfs_reflink_ag_has_free_space(mp,
1057
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
11061058 if (error)
11071059 goto out_cancel;
11081060 }
11091061
1062
+ /*
1063
+ * Compute quota reservation if we think the quota block counter for
1064
+ * this file could increase.
1065
+ *
1066
+ * Adding a written extent to the extent map can cause a bmbt split,
1067
+ * and removing a mapped extent from the extent can cause a bmbt split.
1068
+ * The two operations cannot both cause a split since they operate on
1069
+ * the same index in the bmap btree, so we only need a reservation for
1070
+ * one bmbt split if either thing is happening.
1071
+ *
1072
+ * If we are mapping a written extent into the file, we need to have
1073
+ * enough quota block count reservation to handle the blocks in that
1074
+ * extent. We log only the delta to the quota block counts, so if the
1075
+ * extent we're unmapping also has blocks allocated to it, we don't
1076
+ * need a quota reservation for the extent itself.
1077
+ *
1078
+ * Note that if we're replacing a delalloc reservation with a written
1079
+ * extent, we have to take the full quota reservation because removing
1080
+ * the delalloc reservation gives the block count back to the quota
1081
+ * count. This is suboptimal, but the VFS flushed the dest range
1082
+ * before we started. That should have removed all the delalloc
1083
+ * reservations, but we code defensively.
1084
+ */
1085
+ qres = qdelta = 0;
1086
+ if (smap_real || dmap_written)
1087
+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1088
+ if (!smap_real && dmap_written)
1089
+ qres += dmap->br_blockcount;
1090
+ if (qres > 0) {
1091
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
1092
+ XFS_QMOPT_RES_REGBLKS);
1093
+ if (error)
1094
+ goto out_cancel;
1095
+ }
1096
+
1097
+ if (smap_real) {
1098
+ /*
1099
+ * If the extent we're unmapping is backed by storage (written
1100
+ * or not), unmap the extent and drop its refcount.
1101
+ */
1102
+ xfs_bmap_unmap_extent(tp, ip, &smap);
1103
+ xfs_refcount_decrease_extent(tp, &smap);
1104
+ qdelta -= smap.br_blockcount;
1105
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
1106
+ xfs_filblks_t len = smap.br_blockcount;
1107
+
1108
+ /*
1109
+ * If the extent we're unmapping is a delalloc reservation,
1110
+ * we can use the regular bunmapi function to release the
1111
+ * incore state. Dropping the delalloc reservation takes care
1112
+ * of the quota reservation for us.
1113
+ */
1114
+ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
1115
+ if (error)
1116
+ goto out_cancel;
1117
+ ASSERT(len == 0);
1118
+ }
1119
+
1120
+ /*
1121
+ * If the extent we're sharing is backed by written storage, increase
1122
+ * its refcount and map it into the file.
1123
+ */
1124
+ if (dmap_written) {
1125
+ xfs_refcount_increase_extent(tp, dmap);
1126
+ xfs_bmap_map_extent(tp, ip, dmap);
1127
+ qdelta += dmap->br_blockcount;
1128
+ }
1129
+
1130
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
1131
+
1132
+ /* Update dest isize if needed. */
1133
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
1134
+ newlen = min_t(xfs_off_t, newlen, new_isize);
1135
+ if (newlen > i_size_read(VFS_I(ip))) {
1136
+ trace_xfs_reflink_update_inode_size(ip, newlen);
1137
+ i_size_write(VFS_I(ip), newlen);
1138
+ ip->i_d.di_size = newlen;
1139
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1140
+ }
1141
+
1142
+ /* Commit everything and unlock. */
11101143 error = xfs_trans_commit(tp);
1111
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1112
- if (error)
1113
- goto out;
1114
- return 0;
1144
+ goto out_unlock;
11151145
11161146 out_cancel:
11171147 xfs_trans_cancel(tp);
1148
+out_unlock:
11181149 xfs_iunlock(ip, XFS_ILOCK_EXCL);
11191150 out:
1120
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1151
+ if (error)
1152
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
11211153 return error;
11221154 }
11231155
1124
-/*
1125
- * Iteratively remap one file's extents (and holes) to another's.
1126
- */
1127
-STATIC int
1156
+/* Remap a range of one file to the other. */
1157
+int
11281158 xfs_reflink_remap_blocks(
11291159 struct xfs_inode *src,
1130
- xfs_fileoff_t srcoff,
1160
+ loff_t pos_in,
11311161 struct xfs_inode *dest,
1132
- xfs_fileoff_t destoff,
1133
- xfs_filblks_t len,
1134
- xfs_off_t new_isize)
1162
+ loff_t pos_out,
1163
+ loff_t remap_len,
1164
+ loff_t *remapped)
11351165 {
11361166 struct xfs_bmbt_irec imap;
1167
+ struct xfs_mount *mp = src->i_mount;
1168
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
1169
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
1170
+ xfs_filblks_t len;
1171
+ xfs_filblks_t remapped_len = 0;
1172
+ xfs_off_t new_isize = pos_out + remap_len;
11371173 int nimaps;
11381174 int error = 0;
1139
- xfs_filblks_t range_len;
11401175
1141
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
1142
- while (len) {
1143
- uint lock_mode;
1176
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
1177
+ XFS_MAX_FILEOFF);
11441178
1145
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
1146
- dest, destoff);
1179
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
1180
+
1181
+ while (len > 0) {
1182
+ unsigned int lock_mode;
11471183
11481184 /* Read extent from the source file */
11491185 nimaps = 1;
....@@ -1151,102 +1187,46 @@
11511187 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
11521188 xfs_iunlock(src, lock_mode);
11531189 if (error)
1154
- goto err;
1155
- ASSERT(nimaps == 1);
1190
+ break;
1191
+ /*
1192
+ * The caller supposedly flushed all dirty pages in the source
1193
+ * file range, which means that writeback should have allocated
1194
+ * or deleted all delalloc reservations in that range. If we
1195
+ * find one, that's a good sign that something is seriously
1196
+ * wrong here.
1197
+ */
1198
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
1199
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
1200
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1201
+ error = -EFSCORRUPTED;
1202
+ break;
1203
+ }
11561204
1157
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
1158
- &imap);
1205
+ trace_xfs_reflink_remap_extent_src(src, &imap);
11591206
1160
- /* Translate imap into the destination file. */
1161
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
1162
- imap.br_startoff += destoff - srcoff;
1163
-
1164
- /* Clear dest from destoff to the end of imap and map it in. */
1165
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
1166
- new_isize);
1207
+ /* Remap into the destination file at the given offset. */
1208
+ imap.br_startoff = destoff;
1209
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
11671210 if (error)
1168
- goto err;
1211
+ break;
11691212
11701213 if (fatal_signal_pending(current)) {
11711214 error = -EINTR;
1172
- goto err;
1215
+ break;
11731216 }
11741217
11751218 /* Advance drange/srange */
1176
- srcoff += range_len;
1177
- destoff += range_len;
1178
- len -= range_len;
1219
+ srcoff += imap.br_blockcount;
1220
+ destoff += imap.br_blockcount;
1221
+ len -= imap.br_blockcount;
1222
+ remapped_len += imap.br_blockcount;
11791223 }
11801224
1181
- return 0;
1182
-
1183
-err:
1184
- trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
1225
+ if (error)
1226
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
1227
+ *remapped = min_t(loff_t, remap_len,
1228
+ XFS_FSB_TO_B(src->i_mount, remapped_len));
11851229 return error;
1186
-}
1187
-
1188
-/*
1189
- * Grab the exclusive iolock for a data copy from src to dest, making
1190
- * sure to abide vfs locking order (lowest pointer value goes first) and
1191
- * breaking the pnfs layout leases on dest before proceeding. The loop
1192
- * is needed because we cannot call the blocking break_layout() with the
1193
- * src iolock held, and therefore have to back out both locks.
1194
- */
1195
-static int
1196
-xfs_iolock_two_inodes_and_break_layout(
1197
- struct inode *src,
1198
- struct inode *dest)
1199
-{
1200
- int error;
1201
-
1202
-retry:
1203
- if (src < dest) {
1204
- inode_lock_shared(src);
1205
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
1206
- } else {
1207
- /* src >= dest */
1208
- inode_lock(dest);
1209
- }
1210
-
1211
- error = break_layout(dest, false);
1212
- if (error == -EWOULDBLOCK) {
1213
- inode_unlock(dest);
1214
- if (src < dest)
1215
- inode_unlock_shared(src);
1216
- error = break_layout(dest, true);
1217
- if (error)
1218
- return error;
1219
- goto retry;
1220
- }
1221
- if (error) {
1222
- inode_unlock(dest);
1223
- if (src < dest)
1224
- inode_unlock_shared(src);
1225
- return error;
1226
- }
1227
- if (src > dest)
1228
- inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
1229
- return 0;
1230
-}
1231
-
1232
-/* Unlock both inodes after they've been prepped for a range clone. */
1233
-STATIC void
1234
-xfs_reflink_remap_unlock(
1235
- struct file *file_in,
1236
- struct file *file_out)
1237
-{
1238
- struct inode *inode_in = file_inode(file_in);
1239
- struct xfs_inode *src = XFS_I(inode_in);
1240
- struct inode *inode_out = file_inode(file_out);
1241
- struct xfs_inode *dest = XFS_I(inode_out);
1242
- bool same_inode = (inode_in == inode_out);
1243
-
1244
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
1245
- if (!same_inode)
1246
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
1247
- inode_unlock(inode_out);
1248
- if (!same_inode)
1249
- inode_unlock_shared(inode_in);
12501230 }
12511231
12521232 /*
....@@ -1266,7 +1246,7 @@
12661246
12671247 trace_xfs_zero_eof(ip, isize, pos - isize);
12681248 return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
1269
- &xfs_iomap_ops);
1249
+ &xfs_buffered_write_iomap_ops);
12701250 }
12711251
12721252 /*
....@@ -1298,32 +1278,25 @@
12981278 * stale data in the destination file. Hence we reject these clone attempts with
12991279 * -EINVAL in this case.
13001280 */
1301
-STATIC int
1281
+int
13021282 xfs_reflink_remap_prep(
13031283 struct file *file_in,
13041284 loff_t pos_in,
13051285 struct file *file_out,
13061286 loff_t pos_out,
1307
- u64 *len,
1308
- bool is_dedupe)
1287
+ loff_t *len,
1288
+ unsigned int remap_flags)
13091289 {
13101290 struct inode *inode_in = file_inode(file_in);
13111291 struct xfs_inode *src = XFS_I(inode_in);
13121292 struct inode *inode_out = file_inode(file_out);
13131293 struct xfs_inode *dest = XFS_I(inode_out);
1314
- bool same_inode = (inode_in == inode_out);
1315
- u64 blkmask = i_blocksize(inode_in) - 1;
1316
- ssize_t ret;
1294
+ int ret;
13171295
13181296 /* Lock both files against IO */
1319
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
1297
+ ret = xfs_ilock2_io_mmap(src, dest);
13201298 if (ret)
13211299 return ret;
1322
- if (same_inode)
1323
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
1324
- else
1325
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
1326
- XFS_MMAPLOCK_EXCL);
13271300
13281301 /* Check file eligibility and prepare for block sharing. */
13291302 ret = -EINVAL;
....@@ -1335,28 +1308,10 @@
13351308 if (IS_DAX(inode_in) || IS_DAX(inode_out))
13361309 goto out_unlock;
13371310
1338
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
1339
- len, is_dedupe);
1340
- if (ret <= 0)
1311
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
1312
+ len, remap_flags);
1313
+ if (ret || *len == 0)
13411314 goto out_unlock;
1342
-
1343
- /*
1344
- * If the dedupe data matches, chop off the partial EOF block
1345
- * from the source file so we don't try to dedupe the partial
1346
- * EOF block.
1347
- */
1348
- if (is_dedupe) {
1349
- *len &= ~blkmask;
1350
- } else if (*len & blkmask) {
1351
- /*
1352
- * The user is attempting to share a partial EOF block,
1353
- * if it's inside the destination EOF then reject it.
1354
- */
1355
- if (pos_out + *len < i_size_read(inode_out)) {
1356
- ret = -EINVAL;
1357
- goto out_unlock;
1358
- }
1359
- }
13601315
13611316 /* Attach dquots to dest inode before changing block map */
13621317 ret = xfs_qm_dqattach(dest);
....@@ -1390,178 +1345,10 @@
13901345 if (ret)
13911346 goto out_unlock;
13921347
1393
- /* If we're altering the file contents... */
1394
- if (!is_dedupe) {
1395
- /*
1396
- * ...update the timestamps (which will grab the ilock again
1397
- * from xfs_fs_dirty_inode, so we have to call it before we
1398
- * take the ilock).
1399
- */
1400
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
1401
- ret = file_update_time(file_out);
1402
- if (ret)
1403
- goto out_unlock;
1404
- }
1405
-
1406
- /*
1407
- * ...clear the security bits if the process is not being run
1408
- * by root. This keeps people from modifying setuid and setgid
1409
- * binaries.
1410
- */
1411
- ret = file_remove_privs(file_out);
1412
- if (ret)
1413
- goto out_unlock;
1414
- }
1415
-
1416
- return 1;
1348
+ return 0;
14171349 out_unlock:
1418
- xfs_reflink_remap_unlock(file_in, file_out);
1350
+ xfs_iunlock2_io_mmap(src, dest);
14191351 return ret;
1420
-}
1421
-
1422
-/*
1423
- * Link a range of blocks from one file to another.
1424
- */
1425
-int
1426
-xfs_reflink_remap_range(
1427
- struct file *file_in,
1428
- loff_t pos_in,
1429
- struct file *file_out,
1430
- loff_t pos_out,
1431
- u64 len,
1432
- bool is_dedupe)
1433
-{
1434
- struct inode *inode_in = file_inode(file_in);
1435
- struct xfs_inode *src = XFS_I(inode_in);
1436
- struct inode *inode_out = file_inode(file_out);
1437
- struct xfs_inode *dest = XFS_I(inode_out);
1438
- struct xfs_mount *mp = src->i_mount;
1439
- xfs_fileoff_t sfsbno, dfsbno;
1440
- xfs_filblks_t fsblen;
1441
- xfs_extlen_t cowextsize;
1442
- ssize_t ret;
1443
-
1444
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
1445
- return -EOPNOTSUPP;
1446
-
1447
- if (XFS_FORCED_SHUTDOWN(mp))
1448
- return -EIO;
1449
-
1450
- /* Prepare and then clone file data. */
1451
- ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1452
- &len, is_dedupe);
1453
- if (ret <= 0)
1454
- return ret;
1455
-
1456
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1457
-
1458
- dfsbno = XFS_B_TO_FSBT(mp, pos_out);
1459
- sfsbno = XFS_B_TO_FSBT(mp, pos_in);
1460
- fsblen = XFS_B_TO_FSB(mp, len);
1461
- ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
1462
- pos_out + len);
1463
- if (ret)
1464
- goto out_unlock;
1465
-
1466
- /*
1467
- * Carry the cowextsize hint from src to dest if we're sharing the
1468
- * entire source file to the entire destination file, the source file
1469
- * has a cowextsize hint, and the destination file does not.
1470
- */
1471
- cowextsize = 0;
1472
- if (pos_in == 0 && len == i_size_read(inode_in) &&
1473
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1474
- pos_out == 0 && len >= i_size_read(inode_out) &&
1475
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1476
- cowextsize = src->i_d.di_cowextsize;
1477
-
1478
- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1479
- is_dedupe);
1480
-
1481
-out_unlock:
1482
- xfs_reflink_remap_unlock(file_in, file_out);
1483
- if (ret)
1484
- trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1485
- return ret;
1486
-}
1487
-
1488
-/*
1489
- * The user wants to preemptively CoW all shared blocks in this file,
1490
- * which enables us to turn off the reflink flag. Iterate all
1491
- * extents which are not prealloc/delalloc to see which ranges are
1492
- * mentioned in the refcount tree, then read those blocks into the
1493
- * pagecache, dirty them, fsync them back out, and then we can update
1494
- * the inode flag. What happens if we run out of memory? :)
1495
- */
1496
-STATIC int
1497
-xfs_reflink_dirty_extents(
1498
- struct xfs_inode *ip,
1499
- xfs_fileoff_t fbno,
1500
- xfs_filblks_t end,
1501
- xfs_off_t isize)
1502
-{
1503
- struct xfs_mount *mp = ip->i_mount;
1504
- xfs_agnumber_t agno;
1505
- xfs_agblock_t agbno;
1506
- xfs_extlen_t aglen;
1507
- xfs_agblock_t rbno;
1508
- xfs_extlen_t rlen;
1509
- xfs_off_t fpos;
1510
- xfs_off_t flen;
1511
- struct xfs_bmbt_irec map[2];
1512
- int nmaps;
1513
- int error = 0;
1514
-
1515
- while (end - fbno > 0) {
1516
- nmaps = 1;
1517
- /*
1518
- * Look for extents in the file. Skip holes, delalloc, or
1519
- * unwritten extents; they can't be reflinked.
1520
- */
1521
- error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
1522
- if (error)
1523
- goto out;
1524
- if (nmaps == 0)
1525
- break;
1526
- if (!xfs_bmap_is_real_extent(&map[0]))
1527
- goto next;
1528
-
1529
- map[1] = map[0];
1530
- while (map[1].br_blockcount) {
1531
- agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
1532
- agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
1533
- aglen = map[1].br_blockcount;
1534
-
1535
- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
1536
- aglen, &rbno, &rlen, true);
1537
- if (error)
1538
- goto out;
1539
- if (rbno == NULLAGBLOCK)
1540
- break;
1541
-
1542
- /* Dirty the pages */
1543
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1544
- fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
1545
- (rbno - agbno));
1546
- flen = XFS_FSB_TO_B(mp, rlen);
1547
- if (fpos + flen > isize)
1548
- flen = isize - fpos;
1549
- error = iomap_file_dirty(VFS_I(ip), fpos, flen,
1550
- &xfs_iomap_ops);
1551
- xfs_ilock(ip, XFS_ILOCK_EXCL);
1552
- if (error)
1553
- goto out;
1554
-
1555
- map[1].br_blockcount -= (rbno - agbno + rlen);
1556
- map[1].br_startoff += (rbno - agbno + rlen);
1557
- map[1].br_startblock += (rbno - agbno + rlen);
1558
- }
1559
-
1560
-next:
1561
- fbno = map[0].br_startoff + map[0].br_blockcount;
1562
- }
1563
-out:
1564
- return error;
15651352 }
15661353
15671354 /* Does this inode need the reflink flag? */
....@@ -1640,7 +1427,8 @@
16401427 * We didn't find any shared blocks so turn off the reflink flag.
16411428 * First, get rid of any leftover CoW mappings.
16421429 */
1643
- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
1430
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
1431
+ true);
16441432 if (error)
16451433 return error;
16461434
....@@ -1700,10 +1488,7 @@
17001488 xfs_off_t offset,
17011489 xfs_off_t len)
17021490 {
1703
- struct xfs_mount *mp = ip->i_mount;
1704
- xfs_fileoff_t fbno;
1705
- xfs_filblks_t end;
1706
- xfs_off_t isize;
1491
+ struct inode *inode = VFS_I(ip);
17071492 int error;
17081493
17091494 if (!xfs_is_reflink_inode(ip))
....@@ -1711,20 +1496,15 @@
17111496
17121497 trace_xfs_reflink_unshare(ip, offset, len);
17131498
1714
- inode_dio_wait(VFS_I(ip));
1499
+ inode_dio_wait(inode);
17151500
1716
- /* Try to CoW the selected ranges */
1717
- xfs_ilock(ip, XFS_ILOCK_EXCL);
1718
- fbno = XFS_B_TO_FSBT(mp, offset);
1719
- isize = i_size_read(VFS_I(ip));
1720
- end = XFS_B_TO_FSB(mp, offset + len);
1721
- error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
1501
+ error = iomap_file_unshare(inode, offset, len,
1502
+ &xfs_buffered_write_iomap_ops);
17221503 if (error)
1723
- goto out_unlock;
1724
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
1504
+ goto out;
17251505
1726
- /* Wait for the IO to finish */
1727
- error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1506
+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
1507
+ offset + len - 1);
17281508 if (error)
17291509 goto out;
17301510
....@@ -1732,11 +1512,8 @@
17321512 error = xfs_reflink_try_clear_inode_flag(ip);
17331513 if (error)
17341514 goto out;
1735
-
17361515 return 0;
17371516
1738
-out_unlock:
1739
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
17401517 out:
17411518 trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
17421519 return error;