hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/orangefs/inode.c
....@@ -1,6 +1,7 @@
11 // SPDX-License-Identifier: GPL-2.0
22 /*
33 * (C) 2001 Clemson University and The University of Chicago
4
+ * Copyright 2018 Omnibond Systems, L.L.C.
45 *
56 * See COPYING in top-level directory.
67 */
....@@ -14,40 +15,280 @@
1415 #include "orangefs-kernel.h"
1516 #include "orangefs-bufmap.h"
1617
17
-static int read_one_page(struct page *page)
18
+static int orangefs_writepage_locked(struct page *page,
19
+ struct writeback_control *wbc)
20
+{
21
+ struct inode *inode = page->mapping->host;
22
+ struct orangefs_write_range *wr = NULL;
23
+ struct iov_iter iter;
24
+ struct bio_vec bv;
25
+ size_t len, wlen;
26
+ ssize_t ret;
27
+ loff_t off;
28
+
29
+ set_page_writeback(page);
30
+
31
+ len = i_size_read(inode);
32
+ if (PagePrivate(page)) {
33
+ wr = (struct orangefs_write_range *)page_private(page);
34
+ WARN_ON(wr->pos >= len);
35
+ off = wr->pos;
36
+ if (off + wr->len > len)
37
+ wlen = len - off;
38
+ else
39
+ wlen = wr->len;
40
+ } else {
41
+ WARN_ON(1);
42
+ off = page_offset(page);
43
+ if (off + PAGE_SIZE > len)
44
+ wlen = len - off;
45
+ else
46
+ wlen = PAGE_SIZE;
47
+ }
48
+ /* Should've been handled in orangefs_invalidatepage. */
49
+ WARN_ON(off == len || off + wlen > len);
50
+
51
+ bv.bv_page = page;
52
+ bv.bv_len = wlen;
53
+ bv.bv_offset = off % PAGE_SIZE;
54
+ WARN_ON(wlen == 0);
55
+ iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
56
+
57
+ ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
58
+ len, wr, NULL, NULL);
59
+ if (ret < 0) {
60
+ SetPageError(page);
61
+ mapping_set_error(page->mapping, ret);
62
+ } else {
63
+ ret = 0;
64
+ }
65
+ kfree(detach_page_private(page));
66
+ return ret;
67
+}
68
+
69
+static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
1870 {
1971 int ret;
20
- int max_block;
21
- ssize_t bytes_read = 0;
22
- struct inode *inode = page->mapping->host;
23
- const __u32 blocksize = PAGE_SIZE;
24
- const __u32 blockbits = PAGE_SHIFT;
25
- struct iov_iter to;
26
- struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
72
+ ret = orangefs_writepage_locked(page, wbc);
73
+ unlock_page(page);
74
+ end_page_writeback(page);
75
+ return ret;
76
+}
2777
28
- iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
78
+struct orangefs_writepages {
79
+ loff_t off;
80
+ size_t len;
81
+ kuid_t uid;
82
+ kgid_t gid;
83
+ int maxpages;
84
+ int npages;
85
+ struct page **pages;
86
+ struct bio_vec *bv;
87
+};
2988
30
- gossip_debug(GOSSIP_INODE_DEBUG,
31
- "orangefs_readpage called with page %p\n",
32
- page);
89
+static int orangefs_writepages_work(struct orangefs_writepages *ow,
90
+ struct writeback_control *wbc)
91
+{
92
+ struct inode *inode = ow->pages[0]->mapping->host;
93
+ struct orangefs_write_range *wrp, wr;
94
+ struct iov_iter iter;
95
+ ssize_t ret;
96
+ size_t len;
97
+ loff_t off;
98
+ int i;
3399
34
- max_block = ((inode->i_size / blocksize) + 1);
100
+ len = i_size_read(inode);
35101
36
- if (page->index < max_block) {
37
- loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
38
-
39
- bytes_read = orangefs_inode_read(inode,
40
- &to,
41
- &blockptr_offset,
42
- inode->i_size);
102
+ for (i = 0; i < ow->npages; i++) {
103
+ set_page_writeback(ow->pages[i]);
104
+ ow->bv[i].bv_page = ow->pages[i];
105
+ ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE,
106
+ ow->off + ow->len) -
107
+ max(ow->off, page_offset(ow->pages[i]));
108
+ if (i == 0)
109
+ ow->bv[i].bv_offset = ow->off -
110
+ page_offset(ow->pages[i]);
111
+ else
112
+ ow->bv[i].bv_offset = 0;
43113 }
114
+ iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
115
+
116
+ WARN_ON(ow->off >= len);
117
+ if (ow->off + ow->len > len)
118
+ ow->len = len - ow->off;
119
+
120
+ off = ow->off;
121
+ wr.uid = ow->uid;
122
+ wr.gid = ow->gid;
123
+ ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
124
+ 0, &wr, NULL, NULL);
125
+ if (ret < 0) {
126
+ for (i = 0; i < ow->npages; i++) {
127
+ SetPageError(ow->pages[i]);
128
+ mapping_set_error(ow->pages[i]->mapping, ret);
129
+ if (PagePrivate(ow->pages[i])) {
130
+ wrp = (struct orangefs_write_range *)
131
+ page_private(ow->pages[i]);
132
+ ClearPagePrivate(ow->pages[i]);
133
+ put_page(ow->pages[i]);
134
+ kfree(wrp);
135
+ }
136
+ end_page_writeback(ow->pages[i]);
137
+ unlock_page(ow->pages[i]);
138
+ }
139
+ } else {
140
+ ret = 0;
141
+ for (i = 0; i < ow->npages; i++) {
142
+ if (PagePrivate(ow->pages[i])) {
143
+ wrp = (struct orangefs_write_range *)
144
+ page_private(ow->pages[i]);
145
+ ClearPagePrivate(ow->pages[i]);
146
+ put_page(ow->pages[i]);
147
+ kfree(wrp);
148
+ }
149
+ end_page_writeback(ow->pages[i]);
150
+ unlock_page(ow->pages[i]);
151
+ }
152
+ }
153
+ return ret;
154
+}
155
+
156
+static int orangefs_writepages_callback(struct page *page,
157
+ struct writeback_control *wbc, void *data)
158
+{
159
+ struct orangefs_writepages *ow = data;
160
+ struct orangefs_write_range *wr;
161
+ int ret;
162
+
163
+ if (!PagePrivate(page)) {
164
+ unlock_page(page);
165
+ /* It's not private so there's nothing to write, right? */
166
+ printk("writepages_callback not private!\n");
167
+ BUG();
168
+ return 0;
169
+ }
170
+ wr = (struct orangefs_write_range *)page_private(page);
171
+
172
+ ret = -1;
173
+ if (ow->npages == 0) {
174
+ ow->off = wr->pos;
175
+ ow->len = wr->len;
176
+ ow->uid = wr->uid;
177
+ ow->gid = wr->gid;
178
+ ow->pages[ow->npages++] = page;
179
+ ret = 0;
180
+ goto done;
181
+ }
182
+ if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
183
+ orangefs_writepages_work(ow, wbc);
184
+ ow->npages = 0;
185
+ ret = -1;
186
+ goto done;
187
+ }
188
+ if (ow->off + ow->len == wr->pos) {
189
+ ow->len += wr->len;
190
+ ow->pages[ow->npages++] = page;
191
+ ret = 0;
192
+ goto done;
193
+ }
194
+done:
195
+ if (ret == -1) {
196
+ if (ow->npages) {
197
+ orangefs_writepages_work(ow, wbc);
198
+ ow->npages = 0;
199
+ }
200
+ ret = orangefs_writepage_locked(page, wbc);
201
+ mapping_set_error(page->mapping, ret);
202
+ unlock_page(page);
203
+ end_page_writeback(page);
204
+ } else {
205
+ if (ow->npages == ow->maxpages) {
206
+ orangefs_writepages_work(ow, wbc);
207
+ ow->npages = 0;
208
+ }
209
+ }
210
+ return ret;
211
+}
212
+
213
+static int orangefs_writepages(struct address_space *mapping,
214
+ struct writeback_control *wbc)
215
+{
216
+ struct orangefs_writepages *ow;
217
+ struct blk_plug plug;
218
+ int ret;
219
+ ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
220
+ if (!ow)
221
+ return -ENOMEM;
222
+ ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
223
+ ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
224
+ if (!ow->pages) {
225
+ kfree(ow);
226
+ return -ENOMEM;
227
+ }
228
+ ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
229
+ if (!ow->bv) {
230
+ kfree(ow->pages);
231
+ kfree(ow);
232
+ return -ENOMEM;
233
+ }
234
+ blk_start_plug(&plug);
235
+ ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
236
+ if (ow->npages)
237
+ ret = orangefs_writepages_work(ow, wbc);
238
+ blk_finish_plug(&plug);
239
+ kfree(ow->pages);
240
+ kfree(ow->bv);
241
+ kfree(ow);
242
+ return ret;
243
+}
244
+
245
+static int orangefs_launder_page(struct page *);
246
+
247
+static int orangefs_readpage(struct file *file, struct page *page)
248
+{
249
+ struct inode *inode = page->mapping->host;
250
+ struct iov_iter iter;
251
+ struct bio_vec bv;
252
+ ssize_t ret;
253
+ loff_t off; /* offset into this page */
254
+ pgoff_t index; /* which page */
255
+ struct page *next_page;
256
+ char *kaddr;
257
+ loff_t read_size;
258
+ int buffer_index = -1; /* orangefs shared memory slot */
259
+ int slot_index; /* index into slot */
260
+ int remaining;
261
+
262
+ /*
263
+ * Get up to this many bytes from Orangefs at a time and try
264
+ * to fill them into the page cache at once. Tests with dd made
265
+ * this seem like a reasonable static number, if there was
266
+ * interest perhaps this number could be made setable through
267
+ * sysfs...
268
+ */
269
+ read_size = 524288;
270
+
271
+ if (PageDirty(page))
272
+ orangefs_launder_page(page);
273
+
274
+ off = page_offset(page);
275
+ index = off >> PAGE_SHIFT;
276
+ bv.bv_page = page;
277
+ bv.bv_len = PAGE_SIZE;
278
+ bv.bv_offset = 0;
279
+ iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
280
+
281
+ ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
282
+ read_size, inode->i_size, NULL, &buffer_index, file);
283
+ remaining = ret;
44284 /* this will only zero remaining unread portions of the page data */
45
- iov_iter_zero(~0U, &to);
285
+ iov_iter_zero(~0U, &iter);
46286 /* takes care of potential aliasing */
47287 flush_dcache_page(page);
48
- if (bytes_read < 0) {
49
- ret = bytes_read;
288
+ if (ret < 0) {
50289 SetPageError(page);
290
+ unlock_page(page);
291
+ goto out;
51292 } else {
52293 SetPageUptodate(page);
53294 if (PageError(page))
....@@ -56,95 +297,453 @@
56297 }
57298 /* unlock the page after the ->readpage() routine completes */
58299 unlock_page(page);
300
+
301
+ if (remaining > PAGE_SIZE) {
302
+ slot_index = 0;
303
+ while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
304
+ remaining -= PAGE_SIZE;
305
+ /*
306
+ * It is an optimization to try and fill more than one
307
+ * page... by now we've already gotten the single
308
+ * page we were after, if stuff doesn't seem to
309
+ * be going our way at this point just return
310
+ * and hope for the best.
311
+ *
312
+ * If we look for pages and they're already there is
313
+ * one reason to give up, and if they're not there
314
+ * and we can't create them is another reason.
315
+ */
316
+
317
+ index++;
318
+ slot_index++;
319
+ next_page = find_get_page(inode->i_mapping, index);
320
+ if (next_page) {
321
+ gossip_debug(GOSSIP_FILE_DEBUG,
322
+ "%s: found next page, quitting\n",
323
+ __func__);
324
+ put_page(next_page);
325
+ goto out;
326
+ }
327
+ next_page = find_or_create_page(inode->i_mapping,
328
+ index,
329
+ GFP_KERNEL);
330
+ /*
331
+ * I've never hit this, leave it as a printk for
332
+ * now so it will be obvious.
333
+ */
334
+ if (!next_page) {
335
+ printk("%s: can't create next page, quitting\n",
336
+ __func__);
337
+ goto out;
338
+ }
339
+ kaddr = kmap_atomic(next_page);
340
+ orangefs_bufmap_page_fill(kaddr,
341
+ buffer_index,
342
+ slot_index);
343
+ kunmap_atomic(kaddr);
344
+ SetPageUptodate(next_page);
345
+ unlock_page(next_page);
346
+ put_page(next_page);
347
+ }
348
+ }
349
+
350
+out:
351
+ if (buffer_index != -1)
352
+ orangefs_bufmap_put(buffer_index);
59353 return ret;
60354 }
61355
62
-static int orangefs_readpage(struct file *file, struct page *page)
356
+static int orangefs_write_begin(struct file *file,
357
+ struct address_space *mapping,
358
+ loff_t pos, unsigned len, unsigned flags, struct page **pagep,
359
+ void **fsdata)
63360 {
64
- return read_one_page(page);
65
-}
66
-
67
-static int orangefs_readpages(struct file *file,
68
- struct address_space *mapping,
69
- struct list_head *pages,
70
- unsigned nr_pages)
71
-{
72
- int page_idx;
361
+ struct orangefs_write_range *wr;
362
+ struct page *page;
363
+ pgoff_t index;
73364 int ret;
74365
75
- gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
366
+ index = pos >> PAGE_SHIFT;
76367
77
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
78
- struct page *page;
368
+ page = grab_cache_page_write_begin(mapping, index, flags);
369
+ if (!page)
370
+ return -ENOMEM;
79371
80
- page = list_entry(pages->prev, struct page, lru);
81
- list_del(&page->lru);
82
- if (!add_to_page_cache(page,
83
- mapping,
84
- page->index,
85
- readahead_gfp_mask(mapping))) {
86
- ret = read_one_page(page);
87
- gossip_debug(GOSSIP_INODE_DEBUG,
88
- "failure adding page to cache, read_one_page returned: %d\n",
89
- ret);
90
- } else {
91
- put_page(page);
92
- }
372
+ *pagep = page;
373
+
374
+ if (PageDirty(page) && !PagePrivate(page)) {
375
+ /*
376
+ * Should be impossible. If it happens, launder the page
377
+ * since we don't know what's dirty. This will WARN in
378
+ * orangefs_writepage_locked.
379
+ */
380
+ ret = orangefs_launder_page(page);
381
+ if (ret)
382
+ return ret;
93383 }
94
- BUG_ON(!list_empty(pages));
384
+ if (PagePrivate(page)) {
385
+ struct orangefs_write_range *wr;
386
+ wr = (struct orangefs_write_range *)page_private(page);
387
+ if (wr->pos + wr->len == pos &&
388
+ uid_eq(wr->uid, current_fsuid()) &&
389
+ gid_eq(wr->gid, current_fsgid())) {
390
+ wr->len += len;
391
+ goto okay;
392
+ } else {
393
+ ret = orangefs_launder_page(page);
394
+ if (ret)
395
+ return ret;
396
+ }
397
+ }
398
+
399
+ wr = kmalloc(sizeof *wr, GFP_KERNEL);
400
+ if (!wr)
401
+ return -ENOMEM;
402
+
403
+ wr->pos = pos;
404
+ wr->len = len;
405
+ wr->uid = current_fsuid();
406
+ wr->gid = current_fsgid();
407
+ attach_page_private(page, wr);
408
+okay:
95409 return 0;
410
+}
411
+
412
+static int orangefs_write_end(struct file *file, struct address_space *mapping,
413
+ loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
414
+{
415
+ struct inode *inode = page->mapping->host;
416
+ loff_t last_pos = pos + copied;
417
+
418
+ /*
419
+ * No need to use i_size_read() here, the i_size
420
+ * cannot change under us because we hold the i_mutex.
421
+ */
422
+ if (last_pos > inode->i_size)
423
+ i_size_write(inode, last_pos);
424
+
425
+ /* zero the stale part of the page if we did a short copy */
426
+ if (!PageUptodate(page)) {
427
+ unsigned from = pos & (PAGE_SIZE - 1);
428
+ if (copied < len) {
429
+ zero_user(page, from + copied, len - copied);
430
+ }
431
+ /* Set fully written pages uptodate. */
432
+ if (pos == page_offset(page) &&
433
+ (len == PAGE_SIZE || pos + len == inode->i_size)) {
434
+ zero_user_segment(page, from + copied, PAGE_SIZE);
435
+ SetPageUptodate(page);
436
+ }
437
+ }
438
+
439
+ set_page_dirty(page);
440
+ unlock_page(page);
441
+ put_page(page);
442
+
443
+ mark_inode_dirty_sync(file_inode(file));
444
+ return copied;
96445 }
97446
98447 static void orangefs_invalidatepage(struct page *page,
99448 unsigned int offset,
100449 unsigned int length)
101450 {
102
- gossip_debug(GOSSIP_INODE_DEBUG,
103
- "orangefs_invalidatepage called on page %p "
104
- "(offset is %u)\n",
105
- page,
106
- offset);
451
+ struct orangefs_write_range *wr;
452
+ wr = (struct orangefs_write_range *)page_private(page);
107453
108
- ClearPageUptodate(page);
109
- ClearPageMappedToDisk(page);
110
- return;
454
+ if (offset == 0 && length == PAGE_SIZE) {
455
+ kfree(detach_page_private(page));
456
+ return;
457
+ /* write range entirely within invalidate range (or equal) */
458
+ } else if (page_offset(page) + offset <= wr->pos &&
459
+ wr->pos + wr->len <= page_offset(page) + offset + length) {
460
+ kfree(detach_page_private(page));
461
+ /* XXX is this right? only caller in fs */
462
+ cancel_dirty_page(page);
463
+ return;
464
+ /* invalidate range chops off end of write range */
465
+ } else if (wr->pos < page_offset(page) + offset &&
466
+ wr->pos + wr->len <= page_offset(page) + offset + length &&
467
+ page_offset(page) + offset < wr->pos + wr->len) {
468
+ size_t x;
469
+ x = wr->pos + wr->len - (page_offset(page) + offset);
470
+ WARN_ON(x > wr->len);
471
+ wr->len -= x;
472
+ wr->uid = current_fsuid();
473
+ wr->gid = current_fsgid();
474
+ /* invalidate range chops off beginning of write range */
475
+ } else if (page_offset(page) + offset <= wr->pos &&
476
+ page_offset(page) + offset + length < wr->pos + wr->len &&
477
+ wr->pos < page_offset(page) + offset + length) {
478
+ size_t x;
479
+ x = page_offset(page) + offset + length - wr->pos;
480
+ WARN_ON(x > wr->len);
481
+ wr->pos += x;
482
+ wr->len -= x;
483
+ wr->uid = current_fsuid();
484
+ wr->gid = current_fsgid();
485
+ /* invalidate range entirely within write range (punch hole) */
486
+ } else if (wr->pos < page_offset(page) + offset &&
487
+ page_offset(page) + offset + length < wr->pos + wr->len) {
488
+ /* XXX what do we do here... should not WARN_ON */
489
+ WARN_ON(1);
490
+ /* punch hole */
491
+ /*
492
+ * should we just ignore this and write it out anyway?
493
+ * it hardly makes sense
494
+ */
495
+ return;
496
+ /* non-overlapping ranges */
497
+ } else {
498
+ /* WARN if they do overlap */
499
+ if (!((page_offset(page) + offset + length <= wr->pos) ^
500
+ (wr->pos + wr->len <= page_offset(page) + offset))) {
501
+ WARN_ON(1);
502
+ printk("invalidate range offset %llu length %u\n",
503
+ page_offset(page) + offset, length);
504
+ printk("write range offset %llu length %zu\n",
505
+ wr->pos, wr->len);
506
+ }
507
+ return;
508
+ }
111509
510
+ /*
511
+ * Above there are returns where wr is freed or where we WARN.
512
+ * Thus the following runs if wr was modified above.
513
+ */
514
+
515
+ orangefs_launder_page(page);
112516 }
113517
114518 static int orangefs_releasepage(struct page *page, gfp_t foo)
115519 {
116
- gossip_debug(GOSSIP_INODE_DEBUG,
117
- "orangefs_releasepage called on page %p\n",
118
- page);
119
- return 0;
520
+ return !PagePrivate(page);
120521 }
121522
122
-/*
123
- * Having a direct_IO entry point in the address_space_operations
124
- * struct causes the kernel to allows us to use O_DIRECT on
125
- * open. Nothing will ever call this thing, but in the future we
126
- * will need to be able to use O_DIRECT on open in order to support
127
- * AIO. Modeled after NFS, they do this too.
128
- */
523
/*
 * ->freepage: detach the page's private data and free it.
 */
static void orangefs_freepage(struct page *page)
{
	void *priv = detach_page_private(page);

	kfree(priv);
}
527
+
528
+static int orangefs_launder_page(struct page *page)
529
+{
530
+ int r = 0;
531
+ struct writeback_control wbc = {
532
+ .sync_mode = WB_SYNC_ALL,
533
+ .nr_to_write = 0,
534
+ };
535
+ wait_on_page_writeback(page);
536
+ if (clear_page_dirty_for_io(page)) {
537
+ r = orangefs_writepage_locked(page, &wbc);
538
+ end_page_writeback(page);
539
+ }
540
+ return r;
541
+}
129542
130543 static ssize_t orangefs_direct_IO(struct kiocb *iocb,
131544 struct iov_iter *iter)
132545 {
133
- gossip_debug(GOSSIP_INODE_DEBUG,
134
- "orangefs_direct_IO: %pD\n",
135
- iocb->ki_filp);
546
+ /*
547
+ * Comment from original do_readv_writev:
548
+ * Common entry point for read/write/readv/writev
549
+ * This function will dispatch it to either the direct I/O
550
+ * or buffered I/O path depending on the mount options and/or
551
+ * augmented/extended metadata attached to the file.
552
+ * Note: File extended attributes override any mount options.
553
+ */
554
+ struct file *file = iocb->ki_filp;
555
+ loff_t pos = iocb->ki_pos;
556
+ enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ?
557
+ ORANGEFS_IO_WRITE : ORANGEFS_IO_READ;
558
+ loff_t *offset = &pos;
559
+ struct inode *inode = file->f_mapping->host;
560
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
561
+ struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
562
+ size_t count = iov_iter_count(iter);
563
+ ssize_t total_count = 0;
564
+ ssize_t ret = -EINVAL;
565
+ int i = 0;
136566
137
- return -EINVAL;
567
+ gossip_debug(GOSSIP_FILE_DEBUG,
568
+ "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
569
+ __func__,
570
+ handle,
571
+ (int)count);
572
+
573
+ if (type == ORANGEFS_IO_WRITE) {
574
+ gossip_debug(GOSSIP_FILE_DEBUG,
575
+ "%s(%pU): proceeding with offset : %llu, "
576
+ "size %d\n",
577
+ __func__,
578
+ handle,
579
+ llu(*offset),
580
+ (int)count);
581
+ }
582
+
583
+ if (count == 0) {
584
+ ret = 0;
585
+ goto out;
586
+ }
587
+
588
+ while (iov_iter_count(iter)) {
589
+ size_t each_count = iov_iter_count(iter);
590
+ size_t amt_complete;
591
+ i++;
592
+
593
+ /* how much to transfer in this loop iteration */
594
+ if (each_count > orangefs_bufmap_size_query())
595
+ each_count = orangefs_bufmap_size_query();
596
+
597
+ gossip_debug(GOSSIP_FILE_DEBUG,
598
+ "%s(%pU): size of each_count(%d)\n",
599
+ __func__,
600
+ handle,
601
+ (int)each_count);
602
+ gossip_debug(GOSSIP_FILE_DEBUG,
603
+ "%s(%pU): BEFORE wait_for_io: offset is %d\n",
604
+ __func__,
605
+ handle,
606
+ (int)*offset);
607
+
608
+ ret = wait_for_direct_io(type, inode, offset, iter,
609
+ each_count, 0, NULL, NULL, file);
610
+ gossip_debug(GOSSIP_FILE_DEBUG,
611
+ "%s(%pU): return from wait_for_io:%d\n",
612
+ __func__,
613
+ handle,
614
+ (int)ret);
615
+
616
+ if (ret < 0)
617
+ goto out;
618
+
619
+ *offset += ret;
620
+ total_count += ret;
621
+ amt_complete = ret;
622
+
623
+ gossip_debug(GOSSIP_FILE_DEBUG,
624
+ "%s(%pU): AFTER wait_for_io: offset is %d\n",
625
+ __func__,
626
+ handle,
627
+ (int)*offset);
628
+
629
+ /*
630
+ * if we got a short I/O operations,
631
+ * fall out and return what we got so far
632
+ */
633
+ if (amt_complete < each_count)
634
+ break;
635
+ } /*end while */
636
+
637
+out:
638
+ if (total_count > 0)
639
+ ret = total_count;
640
+ if (ret > 0) {
641
+ if (type == ORANGEFS_IO_READ) {
642
+ file_accessed(file);
643
+ } else {
644
+ file_update_time(file);
645
+ if (*offset > i_size_read(inode))
646
+ i_size_write(inode, *offset);
647
+ }
648
+ }
649
+
650
+ gossip_debug(GOSSIP_FILE_DEBUG,
651
+ "%s(%pU): Value(%d) returned.\n",
652
+ __func__,
653
+ handle,
654
+ (int)ret);
655
+
656
+ return ret;
138657 }
139658
140659 /** ORANGEFS2 implementation of address space operations */
141660 static const struct address_space_operations orangefs_address_operations = {
661
+ .writepage = orangefs_writepage,
142662 .readpage = orangefs_readpage,
143
- .readpages = orangefs_readpages,
663
+ .writepages = orangefs_writepages,
664
+ .set_page_dirty = __set_page_dirty_nobuffers,
665
+ .write_begin = orangefs_write_begin,
666
+ .write_end = orangefs_write_end,
144667 .invalidatepage = orangefs_invalidatepage,
145668 .releasepage = orangefs_releasepage,
669
+ .freepage = orangefs_freepage,
670
+ .launder_page = orangefs_launder_page,
146671 .direct_IO = orangefs_direct_IO,
147672 };
673
+
674
+vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
675
+{
676
+ struct page *page = vmf->page;
677
+ struct inode *inode = file_inode(vmf->vma->vm_file);
678
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
679
+ unsigned long *bitlock = &orangefs_inode->bitlock;
680
+ vm_fault_t ret;
681
+ struct orangefs_write_range *wr;
682
+
683
+ sb_start_pagefault(inode->i_sb);
684
+
685
+ if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
686
+ ret = VM_FAULT_RETRY;
687
+ goto out;
688
+ }
689
+
690
+ lock_page(page);
691
+ if (PageDirty(page) && !PagePrivate(page)) {
692
+ /*
693
+ * Should be impossible. If it happens, launder the page
694
+ * since we don't know what's dirty. This will WARN in
695
+ * orangefs_writepage_locked.
696
+ */
697
+ if (orangefs_launder_page(page)) {
698
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
699
+ goto out;
700
+ }
701
+ }
702
+ if (PagePrivate(page)) {
703
+ wr = (struct orangefs_write_range *)page_private(page);
704
+ if (uid_eq(wr->uid, current_fsuid()) &&
705
+ gid_eq(wr->gid, current_fsgid())) {
706
+ wr->pos = page_offset(page);
707
+ wr->len = PAGE_SIZE;
708
+ goto okay;
709
+ } else {
710
+ if (orangefs_launder_page(page)) {
711
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
712
+ goto out;
713
+ }
714
+ }
715
+ }
716
+ wr = kmalloc(sizeof *wr, GFP_KERNEL);
717
+ if (!wr) {
718
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
719
+ goto out;
720
+ }
721
+ wr->pos = page_offset(page);
722
+ wr->len = PAGE_SIZE;
723
+ wr->uid = current_fsuid();
724
+ wr->gid = current_fsgid();
725
+ attach_page_private(page, wr);
726
+okay:
727
+
728
+ file_update_time(vmf->vma->vm_file);
729
+ if (page->mapping != inode->i_mapping) {
730
+ unlock_page(page);
731
+ ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
732
+ goto out;
733
+ }
734
+
735
+ /*
736
+ * We mark the page dirty already here so that when freeze is in
737
+ * progress, we are guaranteed that writeback during freezing will
738
+ * see the dirty page and writeprotect it again.
739
+ */
740
+ set_page_dirty(page);
741
+ wait_for_stable_page(page);
742
+ ret = VM_FAULT_LOCKED;
743
+out:
744
+ sb_end_pagefault(inode->i_sb);
745
+ return ret;
746
+}
148747
149748 static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
150749 {
....@@ -162,7 +761,7 @@
162761 iattr->ia_size);
163762
164763 /* Ensure that we have a up to date size, so we know if it changed. */
165
- ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
764
+ ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE);
166765 if (ret == -ESTALE)
167766 ret = -EIO;
168767 if (ret) {
....@@ -172,7 +771,11 @@
172771 }
173772 orig_size = i_size_read(inode);
174773
175
- truncate_setsize(inode, iattr->ia_size);
774
+ /* This is truncate_setsize in a different order. */
775
+ truncate_pagecache(inode, iattr->ia_size);
776
+ i_size_write(inode, iattr->ia_size);
777
+ if (iattr->ia_size > orig_size)
778
+ pagecache_isize_extended(inode, orig_size, iattr->ia_size);
176779
177780 new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
178781 if (!new_op)
....@@ -202,22 +805,33 @@
202805 return ret;
203806 }
204807
205
-/*
206
- * Change attributes of an object referenced by dentry.
207
- */
208
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
808
+int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
209809 {
210
- int ret = -EINVAL;
211
- struct inode *inode = dentry->d_inode;
810
+ int ret;
212811
213
- gossip_debug(GOSSIP_INODE_DEBUG,
214
- "%s: called on %pd\n",
215
- __func__,
216
- dentry);
217
-
218
- ret = setattr_prepare(dentry, iattr);
219
- if (ret)
220
- goto out;
812
+ if (iattr->ia_valid & ATTR_MODE) {
813
+ if (iattr->ia_mode & (S_ISVTX)) {
814
+ if (is_root_handle(inode)) {
815
+ /*
816
+ * allow sticky bit to be set on root (since
817
+ * it shows up that way by default anyhow),
818
+ * but don't show it to the server
819
+ */
820
+ iattr->ia_mode -= S_ISVTX;
821
+ } else {
822
+ gossip_debug(GOSSIP_UTILS_DEBUG,
823
+ "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
824
+ ret = -EINVAL;
825
+ goto out;
826
+ }
827
+ }
828
+ if (iattr->ia_mode & (S_ISUID)) {
829
+ gossip_debug(GOSSIP_UTILS_DEBUG,
830
+ "Attempting to set setuid bit (not supported); returning EINVAL.\n");
831
+ ret = -EINVAL;
832
+ goto out;
833
+ }
834
+ }
221835
222836 if (iattr->ia_valid & ATTR_SIZE) {
223837 ret = orangefs_setattr_size(inode, iattr);
....@@ -225,21 +839,51 @@
225839 goto out;
226840 }
227841
842
+again:
843
+ spin_lock(&inode->i_lock);
844
+ if (ORANGEFS_I(inode)->attr_valid) {
845
+ if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) &&
846
+ gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) {
847
+ ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
848
+ } else {
849
+ spin_unlock(&inode->i_lock);
850
+ write_inode_now(inode, 1);
851
+ goto again;
852
+ }
853
+ } else {
854
+ ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
855
+ ORANGEFS_I(inode)->attr_uid = current_fsuid();
856
+ ORANGEFS_I(inode)->attr_gid = current_fsgid();
857
+ }
228858 setattr_copy(inode, iattr);
859
+ spin_unlock(&inode->i_lock);
229860 mark_inode_dirty(inode);
230861
231
- ret = orangefs_inode_setattr(inode, iattr);
232
- gossip_debug(GOSSIP_INODE_DEBUG,
233
- "%s: orangefs_inode_setattr returned %d\n",
234
- __func__,
235
- ret);
236
-
237
- if (!ret && (iattr->ia_valid & ATTR_MODE))
862
+ if (iattr->ia_valid & ATTR_MODE)
238863 /* change mod on a file that has ACLs */
239864 ret = posix_acl_chmod(inode, inode->i_mode);
240865
866
+ ret = 0;
241867 out:
242
- gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
868
+ return ret;
869
+}
870
+
871
+/*
872
+ * Change attributes of an object referenced by dentry.
873
+ */
874
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
875
+{
876
+ int ret;
877
+ gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
878
+ dentry);
879
+ ret = setattr_prepare(dentry, iattr);
880
+ if (ret)
881
+ goto out;
882
+ ret = __orangefs_setattr(d_inode(dentry), iattr);
883
+ sync_inode_metadata(d_inode(dentry), 1);
884
+out:
885
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
886
+ ret);
243887 return ret;
244888 }
245889
....@@ -249,23 +893,21 @@
249893 int orangefs_getattr(const struct path *path, struct kstat *stat,
250894 u32 request_mask, unsigned int flags)
251895 {
252
- int ret = -ENOENT;
896
+ int ret;
253897 struct inode *inode = path->dentry->d_inode;
254898
255899 gossip_debug(GOSSIP_INODE_DEBUG,
256
- "orangefs_getattr: called on %pd\n",
257
- path->dentry);
900
+ "orangefs_getattr: called on %pd mask %u\n",
901
+ path->dentry, request_mask);
258902
259
- ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
903
+ ret = orangefs_inode_getattr(inode,
904
+ request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
260905 if (ret == 0) {
261906 generic_fillattr(inode, stat);
262907
263908 /* override block size reported to stat */
264
- if (request_mask & STATX_SIZE)
265
- stat->result_mask = STATX_BASIC_STATS;
266
- else
267
- stat->result_mask = STATX_BASIC_STATS &
268
- ~STATX_SIZE;
909
+ if (!(request_mask & STATX_SIZE))
910
+ stat->result_mask &= ~STATX_SIZE;
269911
270912 stat->attributes_mask = STATX_ATTR_IMMUTABLE |
271913 STATX_ATTR_APPEND;
....@@ -287,7 +929,7 @@
287929 gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
288930
289931 /* Make sure the permission (and other common attrs) are up to date. */
290
- ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
932
+ ret = orangefs_inode_getattr(inode, 0);
291933 if (ret < 0)
292934 return ret;
293935
....@@ -307,7 +949,7 @@
307949 iattr.ia_valid |= ATTR_CTIME;
308950 if (flags & S_MTIME)
309951 iattr.ia_valid |= ATTR_MTIME;
310
- return orangefs_inode_setattr(inode, &iattr);
952
+ return __orangefs_setattr(inode, &iattr);
311953 }
312954
313955 /* ORANGEFS2 implementation of VFS inode operations for files */
....@@ -367,6 +1009,10 @@
3671009 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
3681010 ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
3691011 ORANGEFS_I(inode)->refn.khandle = ref->khandle;
1012
+ ORANGEFS_I(inode)->attr_valid = 0;
1013
+ hash_init(ORANGEFS_I(inode)->xattr_cache);
1014
+ ORANGEFS_I(inode)->mapping_time = jiffies - 1;
1015
+ ORANGEFS_I(inode)->bitlock = 0;
3701016 return 0;
3711017 }
3721018
....@@ -405,10 +1051,14 @@
4051051 orangefs_test_inode,
4061052 orangefs_set_inode,
4071053 ref);
408
- if (!inode || !(inode->i_state & I_NEW))
1054
+
1055
+ if (!inode)
1056
+ return ERR_PTR(-ENOMEM);
1057
+
1058
+ if (!(inode->i_state & I_NEW))
4091059 return inode;
4101060
411
- error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
1061
+ error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
4121062 if (error) {
4131063 iget_failed(inode);
4141064 return ERR_PTR(error);
....@@ -448,22 +1098,16 @@
4481098
4491099 inode = new_inode(sb);
4501100 if (!inode)
451
- return NULL;
1101
+ return ERR_PTR(-ENOMEM);
4521102
4531103 orangefs_set_inode(inode, ref);
4541104 inode->i_ino = hash; /* needed for stat etc */
4551105
456
- error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
1106
+ error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
4571107 if (error)
4581108 goto out_iput;
4591109
4601110 orangefs_init_iops(inode);
461
-
462
- inode->i_mode = mode;
463
- inode->i_uid = current_fsuid();
464
- inode->i_gid = current_fsgid();
465
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
466
- inode->i_size = PAGE_SIZE;
4671111 inode->i_rdev = dev;
4681112
4691113 error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);