hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/orangefs/file.c
....@@ -1,6 +1,7 @@
11 // SPDX-License-Identifier: GPL-2.0
22 /*
33 * (C) 2001 Clemson University and The University of Chicago
4
+ * Copyright 2018 Omnibond Systems, L.L.C.
45 *
56 * See COPYING in top-level directory.
67 */
....@@ -44,15 +45,19 @@
4445 /*
4546 * Post and wait for the I/O upcall to finish
4647 */
47
-static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
48
- loff_t *offset, struct iov_iter *iter,
49
- size_t total_size, loff_t readahead_size)
48
+ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
49
+ loff_t *offset, struct iov_iter *iter, size_t total_size,
50
+ loff_t readahead_size, struct orangefs_write_range *wr,
51
+ int *index_return, struct file *file)
5052 {
5153 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
5254 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
5355 struct orangefs_kernel_op_s *new_op = NULL;
54
- int buffer_index = -1;
56
+ int buffer_index;
5557 ssize_t ret;
58
+ size_t copy_amount;
59
+ int open_for_read;
60
+ int open_for_write;
5661
5762 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
5863 if (!new_op)
....@@ -84,6 +89,42 @@
8489 new_op->upcall.req.io.buf_index = buffer_index;
8590 new_op->upcall.req.io.count = total_size;
8691 new_op->upcall.req.io.offset = *offset;
92
+ if (type == ORANGEFS_IO_WRITE && wr) {
93
+ new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
94
+ new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
95
+ }
96
+ /*
97
+ * Orangefs has no open, and orangefs checks file permissions
98
+ * on each file access. Posix requires that file permissions
99
+ * be checked on open and nowhere else. Orangefs-through-the-kernel
100
+ * needs to seem posix compliant.
101
+ *
102
+ * The VFS opens files, even if the filesystem provides no
103
+ * method. We can see if a file was successfully opened for
104
+ * read and/or for write by looking at file->f_mode.
105
+ *
106
+ * When writes are flowing from the page cache, file is no
107
+ * longer available. We can trust the VFS to have checked
108
+ * file->f_mode before writing to the page cache.
109
+ *
110
+ * The mode of a file might change between when it is opened
111
+ * and IO commences, or it might be created with an arbitrary mode.
112
+ *
113
+ * We'll make sure we don't hit EACCES during the IO stage by
114
+ * using UID 0. Some of the time we have access without changing
115
+ * to UID 0 - how to check?
116
+ */
117
+ if (file) {
118
+ open_for_write = file->f_mode & FMODE_WRITE;
119
+ open_for_read = file->f_mode & FMODE_READ;
120
+ } else {
121
+ open_for_write = 1;
122
+ open_for_read = 0; /* not relevant? */
123
+ }
124
+ if ((type == ORANGEFS_IO_WRITE) && open_for_write)
125
+ new_op->upcall.uid = 0;
126
+ if ((type == ORANGEFS_IO_READ) && open_for_read)
127
+ new_op->upcall.uid = 0;
87128
88129 gossip_debug(GOSSIP_FILE_DEBUG,
89130 "%s(%pU): offset: %llu total_size: %zd\n",
....@@ -128,7 +169,6 @@
128169 */
129170 if (ret == -EAGAIN && op_state_purged(new_op)) {
130171 orangefs_bufmap_put(buffer_index);
131
- buffer_index = -1;
132172 if (type == ORANGEFS_IO_WRITE)
133173 iov_iter_revert(iter, total_size);
134174 gossip_debug(GOSSIP_FILE_DEBUG,
....@@ -168,7 +208,10 @@
168208 * trigger the write.
169209 */
170210 case OP_VFS_STATE_INPROGR:
171
- ret = total_size;
211
+ if (type == ORANGEFS_IO_READ)
212
+ ret = -EINTR;
213
+ else
214
+ ret = total_size;
172215 break;
173216 default:
174217 gossip_err("%s: unexpected op state :%d:.\n",
....@@ -204,8 +247,25 @@
204247 * can further be kernel-space or user-space addresses.
205248 * or it can be pointers to struct pages
206249 */
250
+
251
+ /*
252
+ * When reading, readahead_size will only be zero when
253
+ * we're doing O_DIRECT, otherwise we got here from
254
+ * orangefs_readpage.
255
+ *
256
+ * If we got here from orangefs_readpage we want to
257
+ * copy either a page or the whole file into the io
258
+ * vector, whichever is smaller.
259
+ */
260
+ if (readahead_size)
261
+ copy_amount =
262
+ min(new_op->downcall.resp.io.amt_complete,
263
+ (__s64)PAGE_SIZE);
264
+ else
265
+ copy_amount = new_op->downcall.resp.io.amt_complete;
266
+
207267 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
208
- new_op->downcall.resp.io.amt_complete);
268
+ copy_amount);
209269 if (ret < 0) {
210270 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
211271 __func__, (long)ret);
....@@ -223,250 +283,112 @@
223283
224284 out:
225285 if (buffer_index >= 0) {
226
- orangefs_bufmap_put(buffer_index);
227
- gossip_debug(GOSSIP_FILE_DEBUG,
228
- "%s(%pU): PUT buffer_index %d\n",
229
- __func__, handle, buffer_index);
230
- buffer_index = -1;
286
+ if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
287
+ /* readpage */
288
+ *index_return = buffer_index;
289
+ gossip_debug(GOSSIP_FILE_DEBUG,
290
+ "%s: hold on to buffer_index :%d:\n",
291
+ __func__, buffer_index);
292
+ } else {
293
+ /* O_DIRECT */
294
+ orangefs_bufmap_put(buffer_index);
295
+ gossip_debug(GOSSIP_FILE_DEBUG,
296
+ "%s(%pU): PUT buffer_index %d\n",
297
+ __func__, handle, buffer_index);
298
+ }
231299 }
232300 op_release(new_op);
233301 return ret;
234302 }
235303
236
-/*
237
- * Common entry point for read/write/readv/writev
238
- * This function will dispatch it to either the direct I/O
239
- * or buffered I/O path depending on the mount options and/or
240
- * augmented/extended metadata attached to the file.
241
- * Note: File extended attributes override any mount options.
242
- */
243
-static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
244
- loff_t *offset, struct iov_iter *iter)
304
+int orangefs_revalidate_mapping(struct inode *inode)
245305 {
246
- struct inode *inode = file->f_mapping->host;
247306 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
248
- struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
249
- size_t count = iov_iter_count(iter);
250
- ssize_t total_count = 0;
251
- ssize_t ret = -EINVAL;
307
+ struct address_space *mapping = inode->i_mapping;
308
+ unsigned long *bitlock = &orangefs_inode->bitlock;
309
+ int ret;
252310
253
- gossip_debug(GOSSIP_FILE_DEBUG,
254
- "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
255
- __func__,
256
- handle,
257
- (int)count);
258
-
259
- if (type == ORANGEFS_IO_WRITE) {
260
- gossip_debug(GOSSIP_FILE_DEBUG,
261
- "%s(%pU): proceeding with offset : %llu, "
262
- "size %d\n",
263
- __func__,
264
- handle,
265
- llu(*offset),
266
- (int)count);
267
- }
268
-
269
- if (count == 0) {
270
- ret = 0;
271
- goto out;
272
- }
273
-
274
- while (iov_iter_count(iter)) {
275
- size_t each_count = iov_iter_count(iter);
276
- size_t amt_complete;
277
-
278
- /* how much to transfer in this loop iteration */
279
- if (each_count > orangefs_bufmap_size_query())
280
- each_count = orangefs_bufmap_size_query();
281
-
282
- gossip_debug(GOSSIP_FILE_DEBUG,
283
- "%s(%pU): size of each_count(%d)\n",
284
- __func__,
285
- handle,
286
- (int)each_count);
287
- gossip_debug(GOSSIP_FILE_DEBUG,
288
- "%s(%pU): BEFORE wait_for_io: offset is %d\n",
289
- __func__,
290
- handle,
291
- (int)*offset);
292
-
293
- ret = wait_for_direct_io(type, inode, offset, iter,
294
- each_count, 0);
295
- gossip_debug(GOSSIP_FILE_DEBUG,
296
- "%s(%pU): return from wait_for_io:%d\n",
297
- __func__,
298
- handle,
299
- (int)ret);
300
-
301
- if (ret < 0)
302
- goto out;
303
-
304
- *offset += ret;
305
- total_count += ret;
306
- amt_complete = ret;
307
-
308
- gossip_debug(GOSSIP_FILE_DEBUG,
309
- "%s(%pU): AFTER wait_for_io: offset is %d\n",
310
- __func__,
311
- handle,
312
- (int)*offset);
313
-
314
- /*
315
- * if we got a short I/O operations,
316
- * fall out and return what we got so far
317
- */
318
- if (amt_complete < each_count)
311
+ while (1) {
312
+ ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
313
+ if (ret)
314
+ return ret;
315
+ spin_lock(&inode->i_lock);
316
+ if (test_bit(1, bitlock)) {
317
+ spin_unlock(&inode->i_lock);
318
+ continue;
319
+ }
320
+ if (!time_before(jiffies, orangefs_inode->mapping_time))
319321 break;
320
- } /*end while */
322
+ spin_unlock(&inode->i_lock);
323
+ return 0;
324
+ }
321325
326
+ set_bit(1, bitlock);
327
+ smp_wmb();
328
+ spin_unlock(&inode->i_lock);
329
+
330
+ unmap_mapping_range(mapping, 0, 0, 0);
331
+ ret = filemap_write_and_wait(mapping);
332
+ if (!ret)
333
+ ret = invalidate_inode_pages2(mapping);
334
+
335
+ orangefs_inode->mapping_time = jiffies +
336
+ orangefs_cache_timeout_msecs*HZ/1000;
337
+
338
+ clear_bit(1, bitlock);
339
+ smp_mb__after_atomic();
340
+ wake_up_bit(bitlock, 1);
341
+
342
+ return ret;
343
+}
344
+
345
+static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
346
+ struct iov_iter *iter)
347
+{
348
+ int ret;
349
+ orangefs_stats.reads++;
350
+
351
+ down_read(&file_inode(iocb->ki_filp)->i_rwsem);
352
+ ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
353
+ if (ret)
354
+ goto out;
355
+
356
+ ret = generic_file_read_iter(iocb, iter);
322357 out:
323
- if (total_count > 0)
324
- ret = total_count;
325
- if (ret > 0) {
326
- if (type == ORANGEFS_IO_READ) {
327
- file_accessed(file);
328
- } else {
329
- file_update_time(file);
330
- /*
331
- * Must invalidate to ensure write loop doesn't
332
- * prevent kernel from reading updated
333
- * attribute. Size probably changed because of
334
- * the write, and other clients could update
335
- * any other attribute.
336
- */
337
- orangefs_inode->getattr_time = jiffies - 1;
338
- }
339
- }
340
-
341
- gossip_debug(GOSSIP_FILE_DEBUG,
342
- "%s(%pU): Value(%d) returned.\n",
343
- __func__,
344
- handle,
345
- (int)ret);
346
-
358
+ up_read(&file_inode(iocb->ki_filp)->i_rwsem);
347359 return ret;
348360 }
349361
350
-/*
351
- * Read data from a specified offset in a file (referenced by inode).
352
- * Data may be placed either in a user or kernel buffer.
353
- */
354
-ssize_t orangefs_inode_read(struct inode *inode,
355
- struct iov_iter *iter,
356
- loff_t *offset,
357
- loff_t readahead_size)
362
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
363
+ struct iov_iter *iter)
358364 {
359
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
360
- size_t count = iov_iter_count(iter);
361
- size_t bufmap_size;
362
- ssize_t ret = -EINVAL;
363
-
364
- orangefs_stats.reads++;
365
-
366
- bufmap_size = orangefs_bufmap_size_query();
367
- if (count > bufmap_size) {
368
- gossip_debug(GOSSIP_FILE_DEBUG,
369
- "%s: count is too large (%zd/%zd)!\n",
370
- __func__, count, bufmap_size);
371
- return -EINVAL;
372
- }
373
-
374
- gossip_debug(GOSSIP_FILE_DEBUG,
375
- "%s(%pU) %zd@%llu\n",
376
- __func__,
377
- &orangefs_inode->refn.khandle,
378
- count,
379
- llu(*offset));
380
-
381
- ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
382
- count, readahead_size);
383
- if (ret > 0)
384
- *offset += ret;
385
-
386
- gossip_debug(GOSSIP_FILE_DEBUG,
387
- "%s(%pU): Value(%zd) returned.\n",
388
- __func__,
389
- &orangefs_inode->refn.khandle,
390
- ret);
391
-
392
- return ret;
393
-}
394
-
395
-static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
396
-{
397
- struct file *file = iocb->ki_filp;
398
- loff_t pos = iocb->ki_pos;
399
- ssize_t rc = 0;
400
-
401
- BUG_ON(iocb->private);
402
-
403
- gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
404
-
405
- orangefs_stats.reads++;
406
-
407
- rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
408
- iocb->ki_pos = pos;
409
-
410
- return rc;
411
-}
412
-
413
-static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
414
-{
415
- struct file *file = iocb->ki_filp;
416
- loff_t pos;
417
- ssize_t rc;
418
-
419
- BUG_ON(iocb->private);
420
-
421
- gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
422
-
423
- inode_lock(file->f_mapping->host);
424
-
425
- /* Make sure generic_write_checks sees an up to date inode size. */
426
- if (file->f_flags & O_APPEND) {
427
- rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
428
- STATX_SIZE);
429
- if (rc == -ESTALE)
430
- rc = -EIO;
431
- if (rc) {
432
- gossip_err("%s: orangefs_inode_getattr failed, "
433
- "rc:%zd:.\n", __func__, rc);
434
- goto out;
435
- }
436
- }
437
-
438
- rc = generic_write_checks(iocb, iter);
439
-
440
- if (rc <= 0) {
441
- gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
442
- __func__, rc);
443
- goto out;
444
- }
445
-
446
- /*
447
- * if we are appending, generic_write_checks would have updated
448
- * pos to the end of the file, so we will wait till now to set
449
- * pos...
450
- */
451
- pos = iocb->ki_pos;
452
-
453
- rc = do_readv_writev(ORANGEFS_IO_WRITE,
454
- file,
455
- &pos,
456
- iter);
457
- if (rc < 0) {
458
- gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
459
- __func__, rc);
460
- goto out;
461
- }
462
-
463
- iocb->ki_pos = pos;
365
+ int ret;
464366 orangefs_stats.writes++;
465367
466
-out:
368
+ if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
369
+ ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
370
+ if (ret)
371
+ return ret;
372
+ }
467373
468
- inode_unlock(file->f_mapping->host);
469
- return rc;
374
+ ret = generic_file_write_iter(iocb, iter);
375
+ return ret;
376
+}
377
+
378
+static int orangefs_getflags(struct inode *inode, unsigned long *uval)
379
+{
380
+ __u64 val = 0;
381
+ int ret;
382
+
383
+ ret = orangefs_inode_getxattr(inode,
384
+ "user.pvfs2.meta_hint",
385
+ &val, sizeof(val));
386
+ if (ret < 0 && ret != -ENODATA)
387
+ return ret;
388
+ else if (ret == -ENODATA)
389
+ val = 0;
390
+ *uval = val;
391
+ return 0;
470392 }
471393
472394 /*
....@@ -474,6 +396,7 @@
474396 */
475397 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
476398 {
399
+ struct inode *inode = file_inode(file);
477400 int ret = -ENOTTY;
478401 __u64 val = 0;
479402 unsigned long uval;
....@@ -487,20 +410,16 @@
487410 * and append flags
488411 */
489412 if (cmd == FS_IOC_GETFLAGS) {
490
- val = 0;
491
- ret = orangefs_inode_getxattr(file_inode(file),
492
- "user.pvfs2.meta_hint",
493
- &val, sizeof(val));
494
- if (ret < 0 && ret != -ENODATA)
413
+ ret = orangefs_getflags(inode, &uval);
414
+ if (ret)
495415 return ret;
496
- else if (ret == -ENODATA)
497
- val = 0;
498
- uval = val;
499416 gossip_debug(GOSSIP_FILE_DEBUG,
500417 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
501418 (unsigned long long)uval);
502419 return put_user(uval, (int __user *)arg);
503420 } else if (cmd == FS_IOC_SETFLAGS) {
421
+ unsigned long old_uval;
422
+
504423 ret = 0;
505424 if (get_user(uval, (int __user *)arg))
506425 return -EFAULT;
....@@ -516,11 +435,17 @@
516435 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
517436 return -EINVAL;
518437 }
438
+ ret = orangefs_getflags(inode, &old_uval);
439
+ if (ret)
440
+ return ret;
441
+ ret = vfs_ioc_setflags_prepare(inode, old_uval, uval);
442
+ if (ret)
443
+ return ret;
519444 val = uval;
520445 gossip_debug(GOSSIP_FILE_DEBUG,
521446 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
522447 (unsigned long long)val);
523
- ret = orangefs_inode_setxattr(file_inode(file),
448
+ ret = orangefs_inode_setxattr(inode,
524449 "user.pvfs2.meta_hint",
525450 &val, sizeof(val), 0);
526451 }
....@@ -532,14 +457,13 @@
532457 {
533458 struct file *file = vmf->vma->vm_file;
534459 int ret;
535
-
536
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
537
- STATX_SIZE);
460
+ ret = orangefs_inode_getattr(file->f_mapping->host,
461
+ ORANGEFS_GETATTR_SIZE);
538462 if (ret == -ESTALE)
539463 ret = -EIO;
540464 if (ret) {
541
- gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
542
- __func__, ret);
465
+ gossip_err("%s: orangefs_inode_getattr failed, "
466
+ "ret:%d:.\n", __func__, ret);
543467 return VM_FAULT_SIGBUS;
544468 }
545469 return filemap_fault(vmf);
....@@ -548,7 +472,7 @@
548472 static const struct vm_operations_struct orangefs_file_vm_ops = {
549473 .fault = orangefs_fault,
550474 .map_pages = filemap_map_pages,
551
- .page_mkwrite = filemap_page_mkwrite,
475
+ .page_mkwrite = orangefs_page_mkwrite,
552476 };
553477
554478 /*
....@@ -556,14 +480,17 @@
556480 */
557481 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
558482 {
483
+ int ret;
484
+
485
+ ret = orangefs_revalidate_mapping(file_inode(file));
486
+ if (ret)
487
+ return ret;
488
+
559489 gossip_debug(GOSSIP_FILE_DEBUG,
560490 "orangefs_file_mmap: called on %s\n",
561491 (file ?
562492 (char *)file->f_path.dentry->d_name.name :
563493 (char *)"Unknown"));
564
-
565
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
566
- return -EINVAL;
567494
568495 /* set the sequential readahead hint */
569496 vma->vm_flags |= VM_SEQ_READ;
....@@ -604,8 +531,7 @@
604531 gossip_debug(GOSSIP_INODE_DEBUG,
605532 "flush_racache finished\n");
606533 }
607
- truncate_inode_pages(file_inode(file)->i_mapping,
608
- 0);
534
+
609535 }
610536 return 0;
611537 }
....@@ -622,6 +548,11 @@
622548 struct orangefs_inode_s *orangefs_inode =
623549 ORANGEFS_I(file_inode(file));
624550 struct orangefs_kernel_op_s *new_op = NULL;
551
+
552
+ ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
553
+ start, end);
554
+ if (ret < 0)
555
+ return ret;
625556
626557 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
627558 if (!new_op)
....@@ -644,7 +575,7 @@
644575 * Change the file pointer position for an instance of an open file.
645576 *
646577 * \note If .llseek is overridden, we must acquire lock as described in
647
- * Documentation/filesystems/Locking.
578
+ * Documentation/filesystems/locking.rst.
648579 *
649580 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
650581 * require much changes to the FS
....@@ -660,8 +591,8 @@
660591 * NOTE: We are only interested in file size here,
661592 * so we set mask accordingly.
662593 */
663
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
664
- STATX_SIZE);
594
+ ret = orangefs_inode_getattr(file->f_mapping->host,
595
+ ORANGEFS_GETATTR_SIZE);
665596 if (ret == -ESTALE)
666597 ret = -EIO;
667598 if (ret) {
....@@ -704,6 +635,25 @@
704635 return rc;
705636 }
706637
638
+static int orangefs_flush(struct file *file, fl_owner_t id)
639
+{
640
+ /*
641
+ * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
642
+ * service_operation in orangefs_fsync.
643
+ *
644
+ * Do not send fsync to OrangeFS server on a close. Do send fsync
645
+ * on an explicit fsync call. This duplicates historical OrangeFS
646
+ * behavior.
647
+ */
648
+ int r;
649
+
650
+ r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
651
+ if (r > 0)
652
+ return 0;
653
+ else
654
+ return r;
655
+}
656
+
707657 /** ORANGEFS implementation of VFS file operations */
708658 const struct file_operations orangefs_file_operations = {
709659 .llseek = orangefs_file_llseek,
....@@ -713,6 +663,7 @@
713663 .unlocked_ioctl = orangefs_ioctl,
714664 .mmap = orangefs_file_mmap,
715665 .open = generic_file_open,
666
+ .flush = orangefs_flush,
716667 .release = orangefs_file_release,
717668 .fsync = orangefs_fsync,
718669 };