hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/read_write.c
....@@ -301,7 +301,7 @@
301301 }
302302 EXPORT_SYMBOL(vfs_llseek);
303303
304
-off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
304
+static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
305305 {
306306 off_t retval;
307307 struct fd f = fdget_pos(fd);
....@@ -331,7 +331,8 @@
331331 }
332332 #endif
333333
334
-#ifdef __ARCH_WANT_SYS_LLSEEK
334
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
335
+ defined(__ARCH_WANT_SYS_LLSEEK)
335336 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
336337 unsigned long, offset_low, loff_t __user *, result,
337338 unsigned int, whence)
....@@ -365,29 +366,37 @@
365366 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
366367 {
367368 struct inode *inode;
368
- loff_t pos;
369369 int retval = -EINVAL;
370370
371371 inode = file_inode(file);
372372 if (unlikely((ssize_t) count < 0))
373373 return retval;
374
- pos = *ppos;
375
- if (unlikely(pos < 0)) {
376
- if (!unsigned_offsets(file))
377
- return retval;
378
- if (count >= -pos) /* both values are in 0..LLONG_MAX */
379
- return -EOVERFLOW;
380
- } else if (unlikely((loff_t) (pos + count) < 0)) {
381
- if (!unsigned_offsets(file))
382
- return retval;
374
+
375
+ /*
376
+ * ranged mandatory locking does not apply to streams - it makes sense
377
+ * only for files where position has a meaning.
378
+ */
379
+ if (ppos) {
380
+ loff_t pos = *ppos;
381
+
382
+ if (unlikely(pos < 0)) {
383
+ if (!unsigned_offsets(file))
384
+ return retval;
385
+ if (count >= -pos) /* both values are in 0..LLONG_MAX */
386
+ return -EOVERFLOW;
387
+ } else if (unlikely((loff_t) (pos + count) < 0)) {
388
+ if (!unsigned_offsets(file))
389
+ return retval;
390
+ }
391
+
392
+ if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
393
+ retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
394
+ read_write == READ ? F_RDLCK : F_WRLCK);
395
+ if (retval < 0)
396
+ return retval;
397
+ }
383398 }
384399
385
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
386
- retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
387
- read_write == READ ? F_RDLCK : F_WRLCK);
388
- if (retval < 0)
389
- return retval;
390
- }
391400 return security_file_permission(file,
392401 read_write == READ ? MAY_READ : MAY_WRITE);
393402 }
....@@ -400,39 +409,69 @@
400409 ssize_t ret;
401410
402411 init_sync_kiocb(&kiocb, filp);
403
- kiocb.ki_pos = *ppos;
412
+ kiocb.ki_pos = (ppos ? *ppos : 0);
404413 iov_iter_init(&iter, READ, &iov, 1, len);
405414
406415 ret = call_read_iter(filp, &kiocb, &iter);
407416 BUG_ON(ret == -EIOCBQUEUED);
408
- *ppos = kiocb.ki_pos;
417
+ if (ppos)
418
+ *ppos = kiocb.ki_pos;
409419 return ret;
410420 }
411421
412
-ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
413
- loff_t *pos)
422
+static int warn_unsupported(struct file *file, const char *op)
414423 {
415
- if (file->f_op->read)
416
- return file->f_op->read(file, buf, count, pos);
417
- else if (file->f_op->read_iter)
418
- return new_sync_read(file, buf, count, pos);
419
- else
424
+ pr_warn_ratelimited(
425
+ "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
426
+ op, file, current->pid, current->comm);
427
+ return -EINVAL;
428
+}
429
+
430
+ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
431
+{
432
+ struct kvec iov = {
433
+ .iov_base = buf,
434
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
435
+ };
436
+ struct kiocb kiocb;
437
+ struct iov_iter iter;
438
+ ssize_t ret;
439
+
440
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
420441 return -EINVAL;
442
+ if (!(file->f_mode & FMODE_CAN_READ))
443
+ return -EINVAL;
444
+ /*
445
+ * Also fail if ->read_iter and ->read are both wired up as that
446
+ * implies very convoluted semantics.
447
+ */
448
+ if (unlikely(!file->f_op->read_iter || file->f_op->read))
449
+ return warn_unsupported(file, "read");
450
+
451
+ init_sync_kiocb(&kiocb, file);
452
+ kiocb.ki_pos = pos ? *pos : 0;
453
+ iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
454
+ ret = file->f_op->read_iter(&kiocb, &iter);
455
+ if (ret > 0) {
456
+ if (pos)
457
+ *pos = kiocb.ki_pos;
458
+ fsnotify_access(file);
459
+ add_rchar(current, ret);
460
+ }
461
+ inc_syscr(current);
462
+ return ret;
421463 }
422464
423465 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
424466 {
425
- mm_segment_t old_fs;
426
- ssize_t result;
467
+ ssize_t ret;
427468
428
- old_fs = get_fs();
429
- set_fs(get_ds());
430
- /* The cast to a user pointer is valid due to the set_fs() */
431
- result = vfs_read(file, (void __user *)buf, count, pos);
432
- set_fs(old_fs);
433
- return result;
469
+ ret = rw_verify_area(READ, file, pos, count);
470
+ if (ret)
471
+ return ret;
472
+ return __kernel_read(file, buf, count, pos);
434473 }
435
-EXPORT_SYMBOL(kernel_read);
474
+EXPORT_SYMBOL_NS(kernel_read, ANDROID_GKI_VFS_EXPORT_ONLY);
436475
437476 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
438477 {
....@@ -442,24 +481,28 @@
442481 return -EBADF;
443482 if (!(file->f_mode & FMODE_CAN_READ))
444483 return -EINVAL;
445
- if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
484
+ if (unlikely(!access_ok(buf, count)))
446485 return -EFAULT;
447486
448487 ret = rw_verify_area(READ, file, pos, count);
449
- if (!ret) {
450
- if (count > MAX_RW_COUNT)
451
- count = MAX_RW_COUNT;
452
- ret = __vfs_read(file, buf, count, pos);
453
- if (ret > 0) {
454
- fsnotify_access(file);
455
- add_rchar(current, ret);
456
- }
457
- inc_syscr(current);
458
- }
488
+ if (ret)
489
+ return ret;
490
+ if (count > MAX_RW_COUNT)
491
+ count = MAX_RW_COUNT;
459492
493
+ if (file->f_op->read)
494
+ ret = file->f_op->read(file, buf, count, pos);
495
+ else if (file->f_op->read_iter)
496
+ ret = new_sync_read(file, buf, count, pos);
497
+ else
498
+ ret = -EINVAL;
499
+ if (ret > 0) {
500
+ fsnotify_access(file);
501
+ add_rchar(current, ret);
502
+ }
503
+ inc_syscr(current);
460504 return ret;
461505 }
462
-EXPORT_SYMBOL_GPL(vfs_read);
463506
464507 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
465508 {
....@@ -469,67 +512,75 @@
469512 ssize_t ret;
470513
471514 init_sync_kiocb(&kiocb, filp);
472
- kiocb.ki_pos = *ppos;
515
+ kiocb.ki_pos = (ppos ? *ppos : 0);
473516 iov_iter_init(&iter, WRITE, &iov, 1, len);
474517
475518 ret = call_write_iter(filp, &kiocb, &iter);
476519 BUG_ON(ret == -EIOCBQUEUED);
477
- if (ret > 0)
520
+ if (ret > 0 && ppos)
478521 *ppos = kiocb.ki_pos;
479522 return ret;
480523 }
481524
482
-ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
483
- loff_t *pos)
484
-{
485
- if (file->f_op->write)
486
- return file->f_op->write(file, p, count, pos);
487
- else if (file->f_op->write_iter)
488
- return new_sync_write(file, p, count, pos);
489
- else
490
- return -EINVAL;
491
-}
492
-
525
+/* caller is responsible for file_start_write/file_end_write */
493526 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
494527 {
495
- mm_segment_t old_fs;
496
- const char __user *p;
528
+ struct kvec iov = {
529
+ .iov_base = (void *)buf,
530
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
531
+ };
532
+ struct kiocb kiocb;
533
+ struct iov_iter iter;
497534 ssize_t ret;
498535
536
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
537
+ return -EBADF;
499538 if (!(file->f_mode & FMODE_CAN_WRITE))
500539 return -EINVAL;
540
+ /*
541
+ * Also fail if ->write_iter and ->write are both wired up as that
542
+ * implies very convoluted semantics.
543
+ */
544
+ if (unlikely(!file->f_op->write_iter || file->f_op->write))
545
+ return warn_unsupported(file, "write");
501546
502
- old_fs = get_fs();
503
- set_fs(get_ds());
504
- p = (__force const char __user *)buf;
505
- if (count > MAX_RW_COUNT)
506
- count = MAX_RW_COUNT;
507
- ret = __vfs_write(file, p, count, pos);
508
- set_fs(old_fs);
547
+ init_sync_kiocb(&kiocb, file);
548
+ kiocb.ki_pos = pos ? *pos : 0;
549
+ iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
550
+ ret = file->f_op->write_iter(&kiocb, &iter);
509551 if (ret > 0) {
552
+ if (pos)
553
+ *pos = kiocb.ki_pos;
510554 fsnotify_modify(file);
511555 add_wchar(current, ret);
512556 }
513557 inc_syscw(current);
514558 return ret;
515559 }
516
-EXPORT_SYMBOL(__kernel_write);
560
+/*
561
+ * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
562
+ * but autofs is one of the few internal kernel users that actually
563
+ * wants this _and_ can be built as a module. So we need to export
564
+ * this symbol for autofs, even though it really isn't appropriate
565
+ * for any other kernel modules.
566
+ */
567
+EXPORT_SYMBOL_GPL(__kernel_write);
517568
518569 ssize_t kernel_write(struct file *file, const void *buf, size_t count,
519570 loff_t *pos)
520571 {
521
- mm_segment_t old_fs;
522
- ssize_t res;
572
+ ssize_t ret;
523573
524
- old_fs = get_fs();
525
- set_fs(get_ds());
526
- /* The cast to a user pointer is valid due to the set_fs() */
527
- res = vfs_write(file, (__force const char __user *)buf, count, pos);
528
- set_fs(old_fs);
574
+ ret = rw_verify_area(WRITE, file, pos, count);
575
+ if (ret)
576
+ return ret;
529577
530
- return res;
578
+ file_start_write(file);
579
+ ret = __kernel_write(file, buf, count, pos);
580
+ file_end_write(file);
581
+ return ret;
531582 }
532
-EXPORT_SYMBOL(kernel_write);
583
+EXPORT_SYMBOL_NS(kernel_write, ANDROID_GKI_VFS_EXPORT_ONLY);
533584
534585 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
535586 {
....@@ -539,36 +590,34 @@
539590 return -EBADF;
540591 if (!(file->f_mode & FMODE_CAN_WRITE))
541592 return -EINVAL;
542
- if (unlikely(!access_ok(VERIFY_READ, buf, count)))
593
+ if (unlikely(!access_ok(buf, count)))
543594 return -EFAULT;
544595
545596 ret = rw_verify_area(WRITE, file, pos, count);
546
- if (!ret) {
547
- if (count > MAX_RW_COUNT)
548
- count = MAX_RW_COUNT;
549
- file_start_write(file);
550
- ret = __vfs_write(file, buf, count, pos);
551
- if (ret > 0) {
552
- fsnotify_modify(file);
553
- add_wchar(current, ret);
554
- }
555
- inc_syscw(current);
556
- file_end_write(file);
597
+ if (ret)
598
+ return ret;
599
+ if (count > MAX_RW_COUNT)
600
+ count = MAX_RW_COUNT;
601
+ file_start_write(file);
602
+ if (file->f_op->write)
603
+ ret = file->f_op->write(file, buf, count, pos);
604
+ else if (file->f_op->write_iter)
605
+ ret = new_sync_write(file, buf, count, pos);
606
+ else
607
+ ret = -EINVAL;
608
+ if (ret > 0) {
609
+ fsnotify_modify(file);
610
+ add_wchar(current, ret);
557611 }
558
-
612
+ inc_syscw(current);
613
+ file_end_write(file);
559614 return ret;
560615 }
561
-EXPORT_SYMBOL_GPL(vfs_write);
562616
563
-static inline loff_t file_pos_read(struct file *file)
617
+/* file_ppos returns &file->f_pos or NULL if file is stream */
618
+static inline loff_t *file_ppos(struct file *file)
564619 {
565
- return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
566
-}
567
-
568
-static inline void file_pos_write(struct file *file, loff_t pos)
569
-{
570
- if ((file->f_mode & FMODE_STREAM) == 0)
571
- file->f_pos = pos;
620
+ return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
572621 }
573622
574623 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
....@@ -577,10 +626,14 @@
577626 ssize_t ret = -EBADF;
578627
579628 if (f.file) {
580
- loff_t pos = file_pos_read(f.file);
581
- ret = vfs_read(f.file, buf, count, &pos);
582
- if (ret >= 0)
583
- file_pos_write(f.file, pos);
629
+ loff_t pos, *ppos = file_ppos(f.file);
630
+ if (ppos) {
631
+ pos = *ppos;
632
+ ppos = &pos;
633
+ }
634
+ ret = vfs_read(f.file, buf, count, ppos);
635
+ if (ret >= 0 && ppos)
636
+ f.file->f_pos = pos;
584637 fdput_pos(f);
585638 }
586639 return ret;
....@@ -597,10 +650,14 @@
597650 ssize_t ret = -EBADF;
598651
599652 if (f.file) {
600
- loff_t pos = file_pos_read(f.file);
601
- ret = vfs_write(f.file, buf, count, &pos);
602
- if (ret >= 0)
603
- file_pos_write(f.file, pos);
653
+ loff_t pos, *ppos = file_ppos(f.file);
654
+ if (ppos) {
655
+ pos = *ppos;
656
+ ppos = &pos;
657
+ }
658
+ ret = vfs_write(f.file, buf, count, ppos);
659
+ if (ret >= 0 && ppos)
660
+ f.file->f_pos = pos;
604661 fdput_pos(f);
605662 }
606663
....@@ -675,14 +732,15 @@
675732 ret = kiocb_set_rw_flags(&kiocb, flags);
676733 if (ret)
677734 return ret;
678
- kiocb.ki_pos = *ppos;
735
+ kiocb.ki_pos = (ppos ? *ppos : 0);
679736
680737 if (type == READ)
681738 ret = call_read_iter(filp, &kiocb, iter);
682739 else
683740 ret = call_write_iter(filp, &kiocb, iter);
684741 BUG_ON(ret == -EIOCBQUEUED);
685
- *ppos = kiocb.ki_pos;
742
+ if (ppos)
743
+ *ppos = kiocb.ki_pos;
686744 return ret;
687745 }
688746
....@@ -721,188 +779,6 @@
721779 return ret;
722780 }
723781
724
-/* A write operation does a read from user space and vice versa */
725
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
726
-
727
-/**
728
- * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
729
- * into the kernel and check that it is valid.
730
- *
731
- * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
732
- * @uvector: Pointer to the userspace array.
733
- * @nr_segs: Number of elements in userspace array.
734
- * @fast_segs: Number of elements in @fast_pointer.
735
- * @fast_pointer: Pointer to (usually small on-stack) kernel array.
736
- * @ret_pointer: (output parameter) Pointer to a variable that will point to
737
- * either @fast_pointer, a newly allocated kernel array, or NULL,
738
- * depending on which array was used.
739
- *
740
- * This function copies an array of &struct iovec of @nr_segs from
741
- * userspace into the kernel and checks that each element is valid (e.g.
742
- * it does not point to a kernel address or cause overflow by being too
743
- * large, etc.).
744
- *
745
- * As an optimization, the caller may provide a pointer to a small
746
- * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
747
- * (the size of this array, or 0 if unused, should be given in @fast_segs).
748
- *
749
- * @ret_pointer will always point to the array that was used, so the
750
- * caller must take care not to call kfree() on it e.g. in case the
751
- * @fast_pointer array was used and it was allocated on the stack.
752
- *
753
- * Return: The total number of bytes covered by the iovec array on success
754
- * or a negative error code on error.
755
- */
756
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
757
- unsigned long nr_segs, unsigned long fast_segs,
758
- struct iovec *fast_pointer,
759
- struct iovec **ret_pointer)
760
-{
761
- unsigned long seg;
762
- ssize_t ret;
763
- struct iovec *iov = fast_pointer;
764
-
765
- /*
766
- * SuS says "The readv() function *may* fail if the iovcnt argument
767
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
768
- * traditionally returned zero for zero segments, so...
769
- */
770
- if (nr_segs == 0) {
771
- ret = 0;
772
- goto out;
773
- }
774
-
775
- /*
776
- * First get the "struct iovec" from user memory and
777
- * verify all the pointers
778
- */
779
- if (nr_segs > UIO_MAXIOV) {
780
- ret = -EINVAL;
781
- goto out;
782
- }
783
- if (nr_segs > fast_segs) {
784
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
785
- if (iov == NULL) {
786
- ret = -ENOMEM;
787
- goto out;
788
- }
789
- }
790
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
791
- ret = -EFAULT;
792
- goto out;
793
- }
794
-
795
- /*
796
- * According to the Single Unix Specification we should return EINVAL
797
- * if an element length is < 0 when cast to ssize_t or if the
798
- * total length would overflow the ssize_t return value of the
799
- * system call.
800
- *
801
- * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
802
- * overflow case.
803
- */
804
- ret = 0;
805
- for (seg = 0; seg < nr_segs; seg++) {
806
- void __user *buf = iov[seg].iov_base;
807
- ssize_t len = (ssize_t)iov[seg].iov_len;
808
-
809
- /* see if we we're about to use an invalid len or if
810
- * it's about to overflow ssize_t */
811
- if (len < 0) {
812
- ret = -EINVAL;
813
- goto out;
814
- }
815
- if (type >= 0
816
- && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
817
- ret = -EFAULT;
818
- goto out;
819
- }
820
- if (len > MAX_RW_COUNT - ret) {
821
- len = MAX_RW_COUNT - ret;
822
- iov[seg].iov_len = len;
823
- }
824
- ret += len;
825
- }
826
-out:
827
- *ret_pointer = iov;
828
- return ret;
829
-}
830
-
831
-#ifdef CONFIG_COMPAT
832
-ssize_t compat_rw_copy_check_uvector(int type,
833
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
834
- unsigned long fast_segs, struct iovec *fast_pointer,
835
- struct iovec **ret_pointer)
836
-{
837
- compat_ssize_t tot_len;
838
- struct iovec *iov = *ret_pointer = fast_pointer;
839
- ssize_t ret = 0;
840
- int seg;
841
-
842
- /*
843
- * SuS says "The readv() function *may* fail if the iovcnt argument
844
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
845
- * traditionally returned zero for zero segments, so...
846
- */
847
- if (nr_segs == 0)
848
- goto out;
849
-
850
- ret = -EINVAL;
851
- if (nr_segs > UIO_MAXIOV)
852
- goto out;
853
- if (nr_segs > fast_segs) {
854
- ret = -ENOMEM;
855
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
856
- if (iov == NULL)
857
- goto out;
858
- }
859
- *ret_pointer = iov;
860
-
861
- ret = -EFAULT;
862
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
863
- goto out;
864
-
865
- /*
866
- * Single unix specification:
867
- * We should -EINVAL if an element length is not >= 0 and fitting an
868
- * ssize_t.
869
- *
870
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
871
- * no overflow possibility.
872
- */
873
- tot_len = 0;
874
- ret = -EINVAL;
875
- for (seg = 0; seg < nr_segs; seg++) {
876
- compat_uptr_t buf;
877
- compat_ssize_t len;
878
-
879
- if (__get_user(len, &uvector->iov_len) ||
880
- __get_user(buf, &uvector->iov_base)) {
881
- ret = -EFAULT;
882
- goto out;
883
- }
884
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
885
- goto out;
886
- if (type >= 0 &&
887
- !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
888
- ret = -EFAULT;
889
- goto out;
890
- }
891
- if (len > MAX_RW_COUNT - tot_len)
892
- len = MAX_RW_COUNT - tot_len;
893
- tot_len += len;
894
- iov->iov_base = compat_ptr(buf);
895
- iov->iov_len = (compat_size_t) len;
896
- uvector++;
897
- iov++;
898
- }
899
- ret = tot_len;
900
-
901
-out:
902
- return ret;
903
-}
904
-#endif
905
-
906782 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
907783 loff_t *pos, rwf_t flags)
908784 {
....@@ -930,6 +806,34 @@
930806 fsnotify_access(file);
931807 return ret;
932808 }
809
+
810
+ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
811
+ struct iov_iter *iter)
812
+{
813
+ size_t tot_len;
814
+ ssize_t ret = 0;
815
+
816
+ if (!file->f_op->read_iter)
817
+ return -EINVAL;
818
+ if (!(file->f_mode & FMODE_READ))
819
+ return -EBADF;
820
+ if (!(file->f_mode & FMODE_CAN_READ))
821
+ return -EINVAL;
822
+
823
+ tot_len = iov_iter_count(iter);
824
+ if (!tot_len)
825
+ goto out;
826
+ ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
827
+ if (ret < 0)
828
+ return ret;
829
+
830
+ ret = call_read_iter(file, iocb, iter);
831
+out:
832
+ if (ret >= 0)
833
+ fsnotify_access(file);
834
+ return ret;
835
+}
836
+EXPORT_SYMBOL(vfs_iocb_iter_read);
933837
934838 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
935839 rwf_t flags)
....@@ -967,6 +871,34 @@
967871 return ret;
968872 }
969873
874
+ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
875
+ struct iov_iter *iter)
876
+{
877
+ size_t tot_len;
878
+ ssize_t ret = 0;
879
+
880
+ if (!file->f_op->write_iter)
881
+ return -EINVAL;
882
+ if (!(file->f_mode & FMODE_WRITE))
883
+ return -EBADF;
884
+ if (!(file->f_mode & FMODE_CAN_WRITE))
885
+ return -EINVAL;
886
+
887
+ tot_len = iov_iter_count(iter);
888
+ if (!tot_len)
889
+ return 0;
890
+ ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
891
+ if (ret < 0)
892
+ return ret;
893
+
894
+ ret = call_write_iter(file, iocb, iter);
895
+ if (ret > 0)
896
+ fsnotify_modify(file);
897
+
898
+ return ret;
899
+}
900
+EXPORT_SYMBOL(vfs_iocb_iter_write);
901
+
970902 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
971903 rwf_t flags)
972904 {
....@@ -976,7 +908,7 @@
976908 }
977909 EXPORT_SYMBOL(vfs_iter_write);
978910
979
-ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
911
+static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
980912 unsigned long vlen, loff_t *pos, rwf_t flags)
981913 {
982914 struct iovec iovstack[UIO_FASTIOV];
....@@ -1018,10 +950,14 @@
1018950 ssize_t ret = -EBADF;
1019951
1020952 if (f.file) {
1021
- loff_t pos = file_pos_read(f.file);
1022
- ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1023
- if (ret >= 0)
1024
- file_pos_write(f.file, pos);
953
+ loff_t pos, *ppos = file_ppos(f.file);
954
+ if (ppos) {
955
+ pos = *ppos;
956
+ ppos = &pos;
957
+ }
958
+ ret = vfs_readv(f.file, vec, vlen, ppos, flags);
959
+ if (ret >= 0 && ppos)
960
+ f.file->f_pos = pos;
1025961 fdput_pos(f);
1026962 }
1027963
....@@ -1038,10 +974,14 @@
1038974 ssize_t ret = -EBADF;
1039975
1040976 if (f.file) {
1041
- loff_t pos = file_pos_read(f.file);
1042
- ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1043
- if (ret >= 0)
1044
- file_pos_write(f.file, pos);
977
+ loff_t pos, *ppos = file_ppos(f.file);
978
+ if (ppos) {
979
+ pos = *ppos;
980
+ ppos = &pos;
981
+ }
982
+ ret = vfs_writev(f.file, vec, vlen, ppos, flags);
983
+ if (ret >= 0 && ppos)
984
+ f.file->f_pos = pos;
1045985 fdput_pos(f);
1046986 }
1047987
....@@ -1155,224 +1095,93 @@
11551095 return do_pwritev(fd, vec, vlen, pos, flags);
11561096 }
11571097
1098
+/*
1099
+ * Various compat syscalls. Note that they all pretend to take a native
1100
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
1101
+ * in_compat_syscall().
1102
+ */
11581103 #ifdef CONFIG_COMPAT
1159
-static size_t compat_readv(struct file *file,
1160
- const struct compat_iovec __user *vec,
1161
- unsigned long vlen, loff_t *pos, rwf_t flags)
1162
-{
1163
- struct iovec iovstack[UIO_FASTIOV];
1164
- struct iovec *iov = iovstack;
1165
- struct iov_iter iter;
1166
- ssize_t ret;
1167
-
1168
- ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1169
- if (ret >= 0) {
1170
- ret = do_iter_read(file, &iter, pos, flags);
1171
- kfree(iov);
1172
- }
1173
- if (ret > 0)
1174
- add_rchar(current, ret);
1175
- inc_syscr(current);
1176
- return ret;
1177
-}
1178
-
1179
-static size_t do_compat_readv(compat_ulong_t fd,
1180
- const struct compat_iovec __user *vec,
1181
- compat_ulong_t vlen, rwf_t flags)
1182
-{
1183
- struct fd f = fdget_pos(fd);
1184
- ssize_t ret;
1185
- loff_t pos;
1186
-
1187
- if (!f.file)
1188
- return -EBADF;
1189
- pos = f.file->f_pos;
1190
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
1191
- if (ret >= 0)
1192
- f.file->f_pos = pos;
1193
- fdput_pos(f);
1194
- return ret;
1195
-
1196
-}
1197
-
1198
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1199
- const struct compat_iovec __user *,vec,
1200
- compat_ulong_t, vlen)
1201
-{
1202
- return do_compat_readv(fd, vec, vlen, 0);
1203
-}
1204
-
1205
-static long do_compat_preadv64(unsigned long fd,
1206
- const struct compat_iovec __user *vec,
1207
- unsigned long vlen, loff_t pos, rwf_t flags)
1208
-{
1209
- struct fd f;
1210
- ssize_t ret;
1211
-
1212
- if (pos < 0)
1213
- return -EINVAL;
1214
- f = fdget(fd);
1215
- if (!f.file)
1216
- return -EBADF;
1217
- ret = -ESPIPE;
1218
- if (f.file->f_mode & FMODE_PREAD)
1219
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
1220
- fdput(f);
1221
- return ret;
1222
-}
1223
-
12241104 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
12251105 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1226
- const struct compat_iovec __user *,vec,
1106
+ const struct iovec __user *, vec,
12271107 unsigned long, vlen, loff_t, pos)
12281108 {
1229
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
1109
+ return do_preadv(fd, vec, vlen, pos, 0);
12301110 }
12311111 #endif
12321112
12331113 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1234
- const struct compat_iovec __user *,vec,
1114
+ const struct iovec __user *, vec,
12351115 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
12361116 {
12371117 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
12381118
1239
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
1119
+ return do_preadv(fd, vec, vlen, pos, 0);
12401120 }
12411121
12421122 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
12431123 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1244
- const struct compat_iovec __user *,vec,
1124
+ const struct iovec __user *, vec,
12451125 unsigned long, vlen, loff_t, pos, rwf_t, flags)
12461126 {
12471127 if (pos == -1)
1248
- return do_compat_readv(fd, vec, vlen, flags);
1249
-
1250
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
1128
+ return do_readv(fd, vec, vlen, flags);
1129
+ return do_preadv(fd, vec, vlen, pos, flags);
12511130 }
12521131 #endif
12531132
12541133 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1255
- const struct compat_iovec __user *,vec,
1134
+ const struct iovec __user *, vec,
12561135 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
12571136 rwf_t, flags)
12581137 {
12591138 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
12601139
12611140 if (pos == -1)
1262
- return do_compat_readv(fd, vec, vlen, flags);
1263
-
1264
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
1265
-}
1266
-
1267
-static size_t compat_writev(struct file *file,
1268
- const struct compat_iovec __user *vec,
1269
- unsigned long vlen, loff_t *pos, rwf_t flags)
1270
-{
1271
- struct iovec iovstack[UIO_FASTIOV];
1272
- struct iovec *iov = iovstack;
1273
- struct iov_iter iter;
1274
- ssize_t ret;
1275
-
1276
- ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1277
- if (ret >= 0) {
1278
- file_start_write(file);
1279
- ret = do_iter_write(file, &iter, pos, flags);
1280
- file_end_write(file);
1281
- kfree(iov);
1282
- }
1283
- if (ret > 0)
1284
- add_wchar(current, ret);
1285
- inc_syscw(current);
1286
- return ret;
1287
-}
1288
-
1289
-static size_t do_compat_writev(compat_ulong_t fd,
1290
- const struct compat_iovec __user* vec,
1291
- compat_ulong_t vlen, rwf_t flags)
1292
-{
1293
- struct fd f = fdget_pos(fd);
1294
- ssize_t ret;
1295
- loff_t pos;
1296
-
1297
- if (!f.file)
1298
- return -EBADF;
1299
- pos = f.file->f_pos;
1300
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
1301
- if (ret >= 0)
1302
- f.file->f_pos = pos;
1303
- fdput_pos(f);
1304
- return ret;
1305
-}
1306
-
1307
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1308
- const struct compat_iovec __user *, vec,
1309
- compat_ulong_t, vlen)
1310
-{
1311
- return do_compat_writev(fd, vec, vlen, 0);
1312
-}
1313
-
1314
-static long do_compat_pwritev64(unsigned long fd,
1315
- const struct compat_iovec __user *vec,
1316
- unsigned long vlen, loff_t pos, rwf_t flags)
1317
-{
1318
- struct fd f;
1319
- ssize_t ret;
1320
-
1321
- if (pos < 0)
1322
- return -EINVAL;
1323
- f = fdget(fd);
1324
- if (!f.file)
1325
- return -EBADF;
1326
- ret = -ESPIPE;
1327
- if (f.file->f_mode & FMODE_PWRITE)
1328
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
1329
- fdput(f);
1330
- return ret;
1141
+ return do_readv(fd, vec, vlen, flags);
1142
+ return do_preadv(fd, vec, vlen, pos, flags);
13311143 }
13321144
13331145 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
13341146 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1335
- const struct compat_iovec __user *,vec,
1147
+ const struct iovec __user *, vec,
13361148 unsigned long, vlen, loff_t, pos)
13371149 {
1338
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1150
+ return do_pwritev(fd, vec, vlen, pos, 0);
13391151 }
13401152 #endif
13411153
13421154 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1343
- const struct compat_iovec __user *,vec,
1155
+ const struct iovec __user *,vec,
13441156 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
13451157 {
13461158 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
13471159
1348
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1160
+ return do_pwritev(fd, vec, vlen, pos, 0);
13491161 }
13501162
13511163 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
13521164 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1353
- const struct compat_iovec __user *,vec,
1165
+ const struct iovec __user *, vec,
13541166 unsigned long, vlen, loff_t, pos, rwf_t, flags)
13551167 {
13561168 if (pos == -1)
1357
- return do_compat_writev(fd, vec, vlen, flags);
1358
-
1359
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1169
+ return do_writev(fd, vec, vlen, flags);
1170
+ return do_pwritev(fd, vec, vlen, pos, flags);
13601171 }
13611172 #endif
13621173
13631174 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1364
- const struct compat_iovec __user *,vec,
1175
+ const struct iovec __user *,vec,
13651176 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
13661177 {
13671178 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
13681179
13691180 if (pos == -1)
1370
- return do_compat_writev(fd, vec, vlen, flags);
1371
-
1372
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1181
+ return do_writev(fd, vec, vlen, flags);
1182
+ return do_pwritev(fd, vec, vlen, pos, flags);
13731183 }
1374
-
1375
-#endif
1184
+#endif /* CONFIG_COMPAT */
13761185
13771186 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
13781187 size_t count, loff_t max)
....@@ -1416,7 +1225,6 @@
14161225 goto fput_in;
14171226 if (!(out.file->f_mode & FMODE_WRITE))
14181227 goto fput_out;
1419
- retval = -EINVAL;
14201228 in_inode = file_inode(in.file);
14211229 out_inode = file_inode(out.file);
14221230 out_pos = out.file->f_pos;
....@@ -1550,6 +1358,109 @@
15501358 }
15511359 #endif
15521360
1361
+/**
1362
+ * generic_copy_file_range - copy data between two files
1363
+ * @file_in: file structure to read from
1364
+ * @pos_in: file offset to read from
1365
+ * @file_out: file structure to write data to
1366
+ * @pos_out: file offset to write data to
1367
+ * @len: amount of data to copy
1368
+ * @flags: copy flags
1369
+ *
1370
+ * This is a generic filesystem helper to copy data from one file to another.
1371
+ * It has no constraints on the source or destination file owners - the files
1372
+ * can belong to different superblocks and different filesystem types. Short
1373
+ * copies are allowed.
1374
+ *
1375
+ * This should be called from the @file_out filesystem, as per the
1376
+ * ->copy_file_range() method.
1377
+ *
1378
+ * Returns the number of bytes copied or a negative error indicating the
1379
+ * failure.
1380
+ */
1381
+
1382
+ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1383
+ struct file *file_out, loff_t pos_out,
1384
+ size_t len, unsigned int flags)
1385
+{
1386
+ return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1387
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1388
+}
1389
+EXPORT_SYMBOL(generic_copy_file_range);
1390
+
1391
+/*
1392
+ * Performs necessary checks before doing a file copy
1393
+ *
1394
+ * Can adjust amount of bytes to copy via @req_count argument.
1395
+ * Returns appropriate error code that caller should return or
1396
+ * zero in case the copy should be allowed.
1397
+ */
1398
+static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1399
+ struct file *file_out, loff_t pos_out,
1400
+ size_t *req_count, unsigned int flags)
1401
+{
1402
+ struct inode *inode_in = file_inode(file_in);
1403
+ struct inode *inode_out = file_inode(file_out);
1404
+ uint64_t count = *req_count;
1405
+ loff_t size_in;
1406
+ int ret;
1407
+
1408
+ ret = generic_file_rw_checks(file_in, file_out);
1409
+ if (ret)
1410
+ return ret;
1411
+
1412
+ /*
1413
+ * We allow some filesystems to handle cross sb copy, but passing
1414
+ * a file of the wrong filesystem type to filesystem driver can result
1415
+ * in an attempt to dereference the wrong type of ->private_data, so
1416
+ * avoid doing that until we really have a good reason.
1417
+ *
1418
+ * nfs and cifs define several different file_system_type structures
1419
+ * and several different sets of file_operations, but they all end up
1420
+ * using the same ->copy_file_range() function pointer.
1421
+ */
1422
+ if (flags & COPY_FILE_SPLICE) {
1423
+ /* cross sb splice is allowed */
1424
+ } else if (file_out->f_op->copy_file_range) {
1425
+ if (file_in->f_op->copy_file_range !=
1426
+ file_out->f_op->copy_file_range)
1427
+ return -EXDEV;
1428
+ } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1429
+ return -EXDEV;
1430
+ }
1431
+
1432
+ /* Don't touch certain kinds of inodes */
1433
+ if (IS_IMMUTABLE(inode_out))
1434
+ return -EPERM;
1435
+
1436
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1437
+ return -ETXTBSY;
1438
+
1439
+ /* Ensure offsets don't wrap. */
1440
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
1441
+ return -EOVERFLOW;
1442
+
1443
+ /* Shorten the copy to EOF */
1444
+ size_in = i_size_read(inode_in);
1445
+ if (pos_in >= size_in)
1446
+ count = 0;
1447
+ else
1448
+ count = min(count, size_in - (uint64_t)pos_in);
1449
+
1450
+ ret = generic_write_check_limits(file_out, pos_out, &count);
1451
+ if (ret)
1452
+ return ret;
1453
+
1454
+ /* Don't allow overlapped copying within the same file. */
1455
+ if (inode_in == inode_out &&
1456
+ pos_out + count > pos_in &&
1457
+ pos_out < pos_in + count)
1458
+ return -EINVAL;
1459
+
1460
+ *req_count = count;
1461
+ return 0;
1462
+}
1463
+
15531464 /*
15541465 * copy_file_range() differs from regular file read and write in that it
15551466 * specifically allows return partial success. When it does so is up to
....@@ -1559,17 +1470,16 @@
15591470 struct file *file_out, loff_t pos_out,
15601471 size_t len, unsigned int flags)
15611472 {
1562
- struct inode *inode_in = file_inode(file_in);
1563
- struct inode *inode_out = file_inode(file_out);
15641473 ssize_t ret;
1474
+ bool splice = flags & COPY_FILE_SPLICE;
15651475
1566
- if (flags != 0)
1476
+ if (flags & ~COPY_FILE_SPLICE)
15671477 return -EINVAL;
15681478
1569
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1570
- return -EISDIR;
1571
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1572
- return -EINVAL;
1479
+ ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1480
+ flags);
1481
+ if (unlikely(ret))
1482
+ return ret;
15731483
15741484 ret = rw_verify_area(READ, file_in, &pos_in, len);
15751485 if (unlikely(ret))
....@@ -1579,42 +1489,48 @@
15791489 if (unlikely(ret))
15801490 return ret;
15811491
1582
- if (!(file_in->f_mode & FMODE_READ) ||
1583
- !(file_out->f_mode & FMODE_WRITE) ||
1584
- (file_out->f_flags & O_APPEND))
1585
- return -EBADF;
1586
-
1587
- /* this could be relaxed once a method supports cross-fs copies */
1588
- if (inode_in->i_sb != inode_out->i_sb)
1589
- return -EXDEV;
1590
-
15911492 if (len == 0)
15921493 return 0;
15931494
15941495 file_start_write(file_out);
15951496
15961497 /*
1597
- * Try cloning first, this is supported by more file systems, and
1598
- * more efficient if both clone and copy are supported (e.g. NFS).
1498
+ * Cloning is supported by more file systems, so we implement copy on
1499
+ * same sb using clone, but for filesystems where both clone and copy
1500
+ * are supported (e.g. nfs,cifs), we only call the copy method.
15991501 */
1600
- if (file_in->f_op->clone_file_range) {
1601
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
1602
- file_out, pos_out, len);
1603
- if (ret == 0) {
1604
- ret = len;
1605
- goto done;
1606
- }
1502
+ if (!splice && file_out->f_op->copy_file_range) {
1503
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
1504
+ file_out, pos_out,
1505
+ len, flags);
1506
+ goto done;
16071507 }
16081508
1609
- if (file_out->f_op->copy_file_range) {
1610
- ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1611
- pos_out, len, flags);
1612
- if (ret != -EOPNOTSUPP)
1509
+ if (!splice && file_in->f_op->remap_file_range &&
1510
+ file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1511
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
1512
+ file_out, pos_out,
1513
+ min_t(loff_t, MAX_RW_COUNT, len),
1514
+ REMAP_FILE_CAN_SHORTEN);
1515
+ if (ret > 0)
16131516 goto done;
16141517 }
16151518
1616
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1617
- len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1519
+ /*
1520
+ * We can get here for same sb copy of filesystems that do not implement
1521
+ * ->copy_file_range() in case filesystem does not support clone or in
1522
+ * case filesystem supports clone but rejected the clone request (e.g.
1523
+ * because it was not block aligned).
1524
+ *
1525
+ * In both cases, fall back to kernel copy so we are able to maintain a
1526
+ * consistent story about which filesystems support copy_file_range()
1527
+ * and which filesystems do not, that will allow userspace tools to
1528
+	 * make consistent decisions w.r.t using copy_file_range().
1529
+ *
1530
+ * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
1531
+ */
1532
+ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1533
+ flags);
16181534
16191535 done:
16201536 if (ret > 0) {
....@@ -1666,6 +1582,10 @@
16661582 pos_out = f_out.file->f_pos;
16671583 }
16681584
1585
+ ret = -EINVAL;
1586
+ if (flags != 0)
1587
+ goto out;
1588
+
16691589 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
16701590 flags);
16711591 if (ret > 0) {
....@@ -1695,477 +1615,92 @@
16951615 return ret;
16961616 }
16971617
1698
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1618
+/*
1619
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
1620
+ * LFS limits. If pos is under the limit it becomes a short access. If it
1621
+ * exceeds the limit we return -EFBIG.
1622
+ */
1623
+int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
16991624 {
1700
- struct inode *inode = file_inode(file);
1625
+ struct inode *inode = file->f_mapping->host;
1626
+ loff_t max_size = inode->i_sb->s_maxbytes;
1627
+ loff_t limit = rlimit(RLIMIT_FSIZE);
17011628
1702
- if (unlikely(pos < 0))
1703
- return -EINVAL;
1704
-
1705
- if (unlikely((loff_t) (pos + len) < 0))
1706
- return -EINVAL;
1707
-
1708
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1709
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1710
- int retval;
1711
-
1712
- retval = locks_mandatory_area(inode, file, pos, end,
1713
- write ? F_WRLCK : F_RDLCK);
1714
- if (retval < 0)
1715
- return retval;
1629
+ if (limit != RLIM_INFINITY) {
1630
+ if (pos >= limit) {
1631
+ send_sig(SIGXFSZ, current, 0);
1632
+ return -EFBIG;
1633
+ }
1634
+ *count = min(*count, limit - pos);
17161635 }
17171636
1718
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1719
-}
1720
-/*
1721
- * Ensure that we don't remap a partial EOF block in the middle of something
1722
- * else. Assume that the offsets have already been checked for block
1723
- * alignment.
1724
- *
1725
- * For deduplication we always scale down to the previous block because we
1726
- * can't meaningfully compare post-EOF contents.
1727
- *
1728
- * For clone we only link a partial EOF block above the destination file's EOF.
1729
- */
1730
-static int generic_remap_check_len(struct inode *inode_in,
1731
- struct inode *inode_out,
1732
- loff_t pos_out,
1733
- u64 *len,
1734
- bool is_dedupe)
1735
-{
1736
- u64 blkmask = i_blocksize(inode_in) - 1;
1637
+ if (!(file->f_flags & O_LARGEFILE))
1638
+ max_size = MAX_NON_LFS;
17371639
1738
- if ((*len & blkmask) == 0)
1739
- return 0;
1640
+ if (unlikely(pos >= max_size))
1641
+ return -EFBIG;
17401642
1741
- if (is_dedupe)
1742
- *len &= ~blkmask;
1743
- else if (pos_out + *len < i_size_read(inode_out))
1744
- return -EINVAL;
1643
+ *count = min(*count, max_size - pos);
17451644
17461645 return 0;
17471646 }
17481647
17491648 /*
1750
- * Check that the two inodes are eligible for cloning, the ranges make
1751
- * sense, and then flush all dirty data. Caller must ensure that the
1752
- * inodes have been locked against any other modifications.
1649
+ * Performs necessary checks before doing a write
17531650 *
1754
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1755
- * the usual negative error code.
1651
+ * Can adjust writing position or amount of bytes to write.
1652
+ * Returns appropriate error code that caller should return or
1653
+ * zero in case that write should be allowed.
17561654 */
1757
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1758
- struct inode *inode_out, loff_t pos_out,
1759
- u64 *len, bool is_dedupe)
1655
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
17601656 {
1761
- loff_t bs = inode_out->i_sb->s_blocksize;
1762
- loff_t blen;
1763
- loff_t isize;
1764
- bool same_inode = (inode_in == inode_out);
1657
+ struct file *file = iocb->ki_filp;
1658
+ struct inode *inode = file->f_mapping->host;
1659
+ loff_t count;
17651660 int ret;
17661661
1767
- /* Don't touch certain kinds of inodes */
1768
- if (IS_IMMUTABLE(inode_out))
1769
- return -EPERM;
1770
-
1771
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1662
+ if (IS_SWAPFILE(inode))
17721663 return -ETXTBSY;
17731664
1774
- /* Don't reflink dirs, pipes, sockets... */
1775
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1776
- return -EISDIR;
1777
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1778
- return -EINVAL;
1779
-
1780
- /* Are we going all the way to the end? */
1781
- isize = i_size_read(inode_in);
1782
- if (isize == 0)
1665
+ if (!iov_iter_count(from))
17831666 return 0;
17841667
1785
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
1786
- if (*len == 0) {
1787
- if (is_dedupe || pos_in == isize)
1788
- return 0;
1789
- if (pos_in > isize)
1790
- return -EINVAL;
1791
- *len = isize - pos_in;
1792
- }
1668
+ /* FIXME: this is for backwards compatibility with 2.4 */
1669
+ if (iocb->ki_flags & IOCB_APPEND)
1670
+ iocb->ki_pos = i_size_read(inode);
17931671
1794
- /* Ensure offsets don't wrap and the input is inside i_size */
1795
- if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1796
- pos_in + *len > isize)
1672
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
17971673 return -EINVAL;
17981674
1799
- /* Don't allow dedupe past EOF in the dest file */
1800
- if (is_dedupe) {
1801
- loff_t disize;
1802
-
1803
- disize = i_size_read(inode_out);
1804
- if (pos_out >= disize || pos_out + *len > disize)
1805
- return -EINVAL;
1806
- }
1807
-
1808
- /* If we're linking to EOF, continue to the block boundary. */
1809
- if (pos_in + *len == isize)
1810
- blen = ALIGN(isize, bs) - pos_in;
1811
- else
1812
- blen = *len;
1813
-
1814
- /* Only reflink if we're aligned to block boundaries */
1815
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1816
- !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1817
- return -EINVAL;
1818
-
1819
- /* Don't allow overlapped reflink within the same file */
1820
- if (same_inode) {
1821
- if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1822
- return -EINVAL;
1823
- }
1824
-
1825
- /* Wait for the completion of any pending IOs on both files */
1826
- inode_dio_wait(inode_in);
1827
- if (!same_inode)
1828
- inode_dio_wait(inode_out);
1829
-
1830
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
1831
- pos_in, pos_in + *len - 1);
1675
+ count = iov_iter_count(from);
1676
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
18321677 if (ret)
18331678 return ret;
18341679
1835
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
1836
- pos_out, pos_out + *len - 1);
1837
- if (ret)
1838
- return ret;
1839
-
1840
- /*
1841
- * Check that the extents are the same.
1842
- */
1843
- if (is_dedupe) {
1844
- bool is_same = false;
1845
-
1846
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1847
- inode_out, pos_out, *len, &is_same);
1848
- if (ret)
1849
- return ret;
1850
- if (!is_same)
1851
- return -EBADE;
1852
- }
1853
-
1854
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
1855
- is_dedupe);
1856
- if (ret)
1857
- return ret;
1858
-
1859
- return 1;
1680
+ iov_iter_truncate(from, count);
1681
+ return iov_iter_count(from);
18601682 }
1861
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1683
+EXPORT_SYMBOL(generic_write_checks);
18621684
1863
-int do_clone_file_range(struct file *file_in, loff_t pos_in,
1864
- struct file *file_out, loff_t pos_out, u64 len)
1685
+/*
1686
+ * Performs common checks before doing a file copy/clone
1687
+ * from @file_in to @file_out.
1688
+ */
1689
+int generic_file_rw_checks(struct file *file_in, struct file *file_out)
18651690 {
18661691 struct inode *inode_in = file_inode(file_in);
18671692 struct inode *inode_out = file_inode(file_out);
1868
- int ret;
18691693
1694
+ /* Don't copy dirs, pipes, sockets... */
18701695 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
18711696 return -EISDIR;
18721697 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
18731698 return -EINVAL;
1874
-
1875
- /*
1876
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1877
- * the same mount. Practically, they only need to be on the same file
1878
- * system.
1879
- */
1880
- if (inode_in->i_sb != inode_out->i_sb)
1881
- return -EXDEV;
18821699
18831700 if (!(file_in->f_mode & FMODE_READ) ||
18841701 !(file_out->f_mode & FMODE_WRITE) ||
18851702 (file_out->f_flags & O_APPEND))
18861703 return -EBADF;
18871704
1888
- if (!file_in->f_op->clone_file_range)
1889
- return -EOPNOTSUPP;
1890
-
1891
- ret = clone_verify_area(file_in, pos_in, len, false);
1892
- if (ret)
1893
- return ret;
1894
-
1895
- ret = clone_verify_area(file_out, pos_out, len, true);
1896
- if (ret)
1897
- return ret;
1898
-
1899
- if (pos_in + len > i_size_read(inode_in))
1900
- return -EINVAL;
1901
-
1902
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
1903
- file_out, pos_out, len);
1904
- if (!ret) {
1905
- fsnotify_access(file_in);
1906
- fsnotify_modify(file_out);
1907
- }
1908
-
1909
- return ret;
1910
-}
1911
-EXPORT_SYMBOL(do_clone_file_range);
1912
-
1913
-int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1914
- struct file *file_out, loff_t pos_out, u64 len)
1915
-{
1916
- int ret;
1917
-
1918
- file_start_write(file_out);
1919
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
1920
- file_end_write(file_out);
1921
-
1922
- return ret;
1923
-}
1924
-EXPORT_SYMBOL(vfs_clone_file_range);
1925
-
1926
-/* Read a page's worth of file data into the page cache. */
1927
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1928
-{
1929
- struct address_space *mapping;
1930
- struct page *page;
1931
- pgoff_t n;
1932
-
1933
- n = offset >> PAGE_SHIFT;
1934
- mapping = inode->i_mapping;
1935
- page = read_mapping_page(mapping, n, NULL);
1936
- if (IS_ERR(page))
1937
- return page;
1938
- if (!PageUptodate(page)) {
1939
- put_page(page);
1940
- return ERR_PTR(-EIO);
1941
- }
1942
- return page;
1943
-}
1944
-
1945
-/*
1946
- * Lock two pages, ensuring that we lock in offset order if the pages are from
1947
- * the same file.
1948
- */
1949
-static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1950
-{
1951
- /* Always lock in order of increasing index. */
1952
- if (page1->index > page2->index)
1953
- swap(page1, page2);
1954
-
1955
- lock_page(page1);
1956
- if (page1 != page2)
1957
- lock_page(page2);
1958
-}
1959
-
1960
-/* Unlock two pages, being careful not to unlock the same page twice. */
1961
-static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
1962
-{
1963
- unlock_page(page1);
1964
- if (page1 != page2)
1965
- unlock_page(page2);
1966
-}
1967
-
1968
-/*
1969
- * Compare extents of two files to see if they are the same.
1970
- * Caller must have locked both inodes to prevent write races.
1971
- */
1972
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1973
- struct inode *dest, loff_t destoff,
1974
- loff_t len, bool *is_same)
1975
-{
1976
- loff_t src_poff;
1977
- loff_t dest_poff;
1978
- void *src_addr;
1979
- void *dest_addr;
1980
- struct page *src_page;
1981
- struct page *dest_page;
1982
- loff_t cmp_len;
1983
- bool same;
1984
- int error;
1985
-
1986
- error = -EINVAL;
1987
- same = true;
1988
- while (len) {
1989
- src_poff = srcoff & (PAGE_SIZE - 1);
1990
- dest_poff = destoff & (PAGE_SIZE - 1);
1991
- cmp_len = min(PAGE_SIZE - src_poff,
1992
- PAGE_SIZE - dest_poff);
1993
- cmp_len = min(cmp_len, len);
1994
- if (cmp_len <= 0)
1995
- goto out_error;
1996
-
1997
- src_page = vfs_dedupe_get_page(src, srcoff);
1998
- if (IS_ERR(src_page)) {
1999
- error = PTR_ERR(src_page);
2000
- goto out_error;
2001
- }
2002
- dest_page = vfs_dedupe_get_page(dest, destoff);
2003
- if (IS_ERR(dest_page)) {
2004
- error = PTR_ERR(dest_page);
2005
- put_page(src_page);
2006
- goto out_error;
2007
- }
2008
-
2009
- vfs_lock_two_pages(src_page, dest_page);
2010
-
2011
- /*
2012
- * Now that we've locked both pages, make sure they're still
2013
- * mapped to the file data we're interested in. If not,
2014
- * someone is invalidating pages on us and we lose.
2015
- */
2016
- if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
2017
- src_page->mapping != src->i_mapping ||
2018
- dest_page->mapping != dest->i_mapping) {
2019
- same = false;
2020
- goto unlock;
2021
- }
2022
-
2023
- src_addr = kmap_atomic(src_page);
2024
- dest_addr = kmap_atomic(dest_page);
2025
-
2026
- flush_dcache_page(src_page);
2027
- flush_dcache_page(dest_page);
2028
-
2029
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
2030
- same = false;
2031
-
2032
- kunmap_atomic(dest_addr);
2033
- kunmap_atomic(src_addr);
2034
-unlock:
2035
- vfs_unlock_two_pages(src_page, dest_page);
2036
- put_page(dest_page);
2037
- put_page(src_page);
2038
-
2039
- if (!same)
2040
- break;
2041
-
2042
- srcoff += cmp_len;
2043
- destoff += cmp_len;
2044
- len -= cmp_len;
2045
- }
2046
-
2047
- *is_same = same;
20481705 return 0;
2049
-
2050
-out_error:
2051
- return error;
20521706 }
2053
-EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
2054
-
2055
-int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
2056
- struct file *dst_file, loff_t dst_pos, u64 len)
2057
-{
2058
- s64 ret;
2059
-
2060
- ret = mnt_want_write_file(dst_file);
2061
- if (ret)
2062
- return ret;
2063
-
2064
- ret = clone_verify_area(dst_file, dst_pos, len, true);
2065
- if (ret < 0)
2066
- goto out_drop_write;
2067
-
2068
- ret = -EINVAL;
2069
- if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
2070
- goto out_drop_write;
2071
-
2072
- ret = -EXDEV;
2073
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
2074
- goto out_drop_write;
2075
-
2076
- ret = -EISDIR;
2077
- if (S_ISDIR(file_inode(dst_file)->i_mode))
2078
- goto out_drop_write;
2079
-
2080
- ret = -EINVAL;
2081
- if (!dst_file->f_op->dedupe_file_range)
2082
- goto out_drop_write;
2083
-
2084
- ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
2085
- dst_file, dst_pos, len);
2086
-out_drop_write:
2087
- mnt_drop_write_file(dst_file);
2088
-
2089
- return ret;
2090
-}
2091
-EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2092
-
2093
-int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2094
-{
2095
- struct file_dedupe_range_info *info;
2096
- struct inode *src = file_inode(file);
2097
- u64 off;
2098
- u64 len;
2099
- int i;
2100
- int ret;
2101
- u16 count = same->dest_count;
2102
- int deduped;
2103
-
2104
- if (!(file->f_mode & FMODE_READ))
2105
- return -EINVAL;
2106
-
2107
- if (same->reserved1 || same->reserved2)
2108
- return -EINVAL;
2109
-
2110
- off = same->src_offset;
2111
- len = same->src_length;
2112
-
2113
- ret = -EISDIR;
2114
- if (S_ISDIR(src->i_mode))
2115
- goto out;
2116
-
2117
- ret = -EINVAL;
2118
- if (!S_ISREG(src->i_mode))
2119
- goto out;
2120
-
2121
- ret = clone_verify_area(file, off, len, false);
2122
- if (ret < 0)
2123
- goto out;
2124
- ret = 0;
2125
-
2126
- if (off + len > i_size_read(src))
2127
- return -EINVAL;
2128
-
2129
- /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2130
- len = min_t(u64, len, 1 << 30);
2131
-
2132
- /* pre-format output fields to sane values */
2133
- for (i = 0; i < count; i++) {
2134
- same->info[i].bytes_deduped = 0ULL;
2135
- same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2136
- }
2137
-
2138
- for (i = 0, info = same->info; i < count; i++, info++) {
2139
- struct fd dst_fd = fdget(info->dest_fd);
2140
- struct file *dst_file = dst_fd.file;
2141
-
2142
- if (!dst_file) {
2143
- info->status = -EBADF;
2144
- goto next_loop;
2145
- }
2146
-
2147
- if (info->reserved) {
2148
- info->status = -EINVAL;
2149
- goto next_fdput;
2150
- }
2151
-
2152
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2153
- info->dest_offset, len);
2154
- if (deduped == -EBADE)
2155
- info->status = FILE_DEDUPE_RANGE_DIFFERS;
2156
- else if (deduped < 0)
2157
- info->status = deduped;
2158
- else
2159
- info->bytes_deduped = len;
2160
-
2161
-next_fdput:
2162
- fdput(dst_fd);
2163
-next_loop:
2164
- if (fatal_signal_pending(current))
2165
- goto out;
2166
- }
2167
-
2168
-out:
2169
- return ret;
2170
-}
2171
-EXPORT_SYMBOL(vfs_dedupe_file_range);