.. | .. |
---|
10 | 10 | #include <linux/pagevec.h> |
---|
11 | 11 | #include <linux/task_io_accounting_ops.h> |
---|
12 | 12 | #include <linux/signal.h> |
---|
| 13 | +#include <linux/iversion.h> |
---|
| 14 | +#include <linux/ktime.h> |
---|
13 | 15 | |
---|
14 | 16 | #include "super.h" |
---|
15 | 17 | #include "mds_client.h" |
---|
16 | 18 | #include "cache.h" |
---|
| 19 | +#include "metric.h" |
---|
17 | 20 | #include <linux/ceph/osd_client.h> |
---|
18 | 21 | #include <linux/ceph/striper.h> |
---|
19 | 22 | |
---|
.. | .. |
---|
150 | 153 | if (!PagePrivate(page)) |
---|
151 | 154 | return; |
---|
152 | 155 | |
---|
153 | | - ClearPageChecked(page); |
---|
154 | | - |
---|
155 | 156 | dout("%p invalidatepage %p idx %lu full dirty page\n", |
---|
156 | 157 | inode, page, page->index); |
---|
157 | 158 | |
---|
.. | .. |
---|
173 | 174 | return !PagePrivate(page); |
---|
174 | 175 | } |
---|
175 | 176 | |
---|
176 | | -/* |
---|
177 | | - * read a single page, without unlocking it. |
---|
178 | | - */ |
---|
| 177 | +/* read a single page, without unlocking it. */ |
---|
179 | 178 | static int ceph_do_readpage(struct file *filp, struct page *page) |
---|
180 | 179 | { |
---|
181 | 180 | struct inode *inode = file_inode(filp); |
---|
182 | 181 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
183 | | - struct ceph_osd_client *osdc = |
---|
184 | | - &ceph_inode_to_client(inode)->client->osdc; |
---|
| 182 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
| 183 | + struct ceph_osd_client *osdc = &fsc->client->osdc; |
---|
| 184 | + struct ceph_osd_request *req; |
---|
| 185 | + struct ceph_vino vino = ceph_vino(inode); |
---|
185 | 186 | int err = 0; |
---|
186 | 187 | u64 off = page_offset(page); |
---|
187 | 188 | u64 len = PAGE_SIZE; |
---|
.. | .. |
---|
208 | 209 | if (err == 0) |
---|
209 | 210 | return -EINPROGRESS; |
---|
210 | 211 | |
---|
211 | | - dout("readpage inode %p file %p page %p index %lu\n", |
---|
212 | | - inode, filp, page, page->index); |
---|
213 | | - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, |
---|
214 | | - off, &len, |
---|
215 | | - ci->i_truncate_seq, ci->i_truncate_size, |
---|
216 | | - &page, 1, 0); |
---|
| 212 | + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", |
---|
| 213 | + vino.ino, vino.snap, filp, off, len, page, page->index); |
---|
| 214 | + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, |
---|
| 215 | + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, |
---|
| 216 | + ci->i_truncate_seq, ci->i_truncate_size, |
---|
| 217 | + false); |
---|
| 218 | + if (IS_ERR(req)) |
---|
| 219 | + return PTR_ERR(req); |
---|
| 220 | + |
---|
| 221 | + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); |
---|
| 222 | + |
---|
| 223 | + err = ceph_osdc_start_request(osdc, req, false); |
---|
| 224 | + if (!err) |
---|
| 225 | + err = ceph_osdc_wait_request(osdc, req); |
---|
| 226 | + |
---|
| 227 | + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 228 | + req->r_end_latency, err); |
---|
| 229 | + |
---|
| 230 | + ceph_osdc_put_request(req); |
---|
| 231 | + dout("readpage result %d\n", err); |
---|
| 232 | + |
---|
217 | 233 | if (err == -ENOENT) |
---|
218 | 234 | err = 0; |
---|
219 | 235 | if (err < 0) { |
---|
220 | | - SetPageError(page); |
---|
221 | 236 | ceph_fscache_readpage_cancel(inode, page); |
---|
| 237 | + if (err == -EBLOCKLISTED) |
---|
| 238 | + fsc->blocklisted = true; |
---|
222 | 239 | goto out; |
---|
223 | 240 | } |
---|
224 | 241 | if (err < PAGE_SIZE) |
---|
.. | .. |
---|
250 | 267 | static void finish_read(struct ceph_osd_request *req) |
---|
251 | 268 | { |
---|
252 | 269 | struct inode *inode = req->r_inode; |
---|
| 270 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
253 | 271 | struct ceph_osd_data *osd_data; |
---|
254 | 272 | int rc = req->r_result <= 0 ? req->r_result : 0; |
---|
255 | 273 | int bytes = req->r_result >= 0 ? req->r_result : 0; |
---|
.. | .. |
---|
257 | 275 | int i; |
---|
258 | 276 | |
---|
259 | 277 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); |
---|
| 278 | + if (rc == -EBLOCKLISTED) |
---|
| 279 | + ceph_inode_to_client(inode)->blocklisted = true; |
---|
260 | 280 | |
---|
261 | 281 | /* unlock all pages, zeroing any data we didn't read */ |
---|
262 | 282 | osd_data = osd_req_op_extent_osd_data(req, 0); |
---|
.. | .. |
---|
285 | 305 | put_page(page); |
---|
286 | 306 | bytes -= PAGE_SIZE; |
---|
287 | 307 | } |
---|
| 308 | + |
---|
| 309 | + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 310 | + req->r_end_latency, rc); |
---|
| 311 | + |
---|
288 | 312 | kfree(osd_data->pages); |
---|
289 | 313 | } |
---|
290 | 314 | |
---|
.. | .. |
---|
298 | 322 | struct ceph_osd_client *osdc = |
---|
299 | 323 | &ceph_inode_to_client(inode)->client->osdc; |
---|
300 | 324 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
301 | | - struct page *page = list_entry(page_list->prev, struct page, lru); |
---|
| 325 | + struct page *page = lru_to_page(page_list); |
---|
302 | 326 | struct ceph_vino vino; |
---|
303 | 327 | struct ceph_osd_request *req; |
---|
304 | 328 | u64 off; |
---|
.. | .. |
---|
314 | 338 | /* caller of readpages does not hold buffer and read caps |
---|
315 | 339 | * (fadvise, madvise and readahead cases) */ |
---|
316 | 340 | int want = CEPH_CAP_FILE_CACHE; |
---|
317 | | - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); |
---|
| 341 | + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, |
---|
| 342 | + true, &got); |
---|
318 | 343 | if (ret < 0) { |
---|
319 | 344 | dout("start_read %p, error getting cap\n", inode); |
---|
320 | 345 | } else if (!(got & want)) { |
---|
.. | .. |
---|
325 | 350 | if (got) |
---|
326 | 351 | ceph_put_cap_refs(ci, got); |
---|
327 | 352 | while (!list_empty(page_list)) { |
---|
328 | | - page = list_entry(page_list->prev, |
---|
329 | | - struct page, lru); |
---|
| 353 | + page = lru_to_page(page_list); |
---|
330 | 354 | list_del(&page->lru); |
---|
331 | 355 | put_page(page); |
---|
332 | 356 | } |
---|
.. | .. |
---|
561 | 585 | /* |
---|
562 | 586 | * Write a single page, but leave the page locked. |
---|
563 | 587 | * |
---|
564 | | - * If we get a write error, set the page error bit, but still adjust the |
---|
| 588 | + * If we get a write error, mark the mapping for error, but still adjust the |
---|
565 | 589 | * dirty page accounting (i.e., page is no longer dirty). |
---|
566 | 590 | */ |
---|
567 | 591 | static int writepage_nounlock(struct page *page, struct writeback_control *wbc) |
---|
568 | 592 | { |
---|
569 | | - struct inode *inode; |
---|
570 | | - struct ceph_inode_info *ci; |
---|
571 | | - struct ceph_fs_client *fsc; |
---|
| 593 | + struct inode *inode = page->mapping->host; |
---|
| 594 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 595 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
572 | 596 | struct ceph_snap_context *snapc, *oldest; |
---|
573 | 597 | loff_t page_off = page_offset(page); |
---|
574 | | - int err, len = PAGE_SIZE; |
---|
| 598 | + int err; |
---|
| 599 | + loff_t len = PAGE_SIZE; |
---|
575 | 600 | struct ceph_writeback_ctl ceph_wbc; |
---|
| 601 | + struct ceph_osd_client *osdc = &fsc->client->osdc; |
---|
| 602 | + struct ceph_osd_request *req; |
---|
576 | 603 | |
---|
577 | 604 | dout("writepage %p idx %lu\n", page, page->index); |
---|
578 | | - |
---|
579 | | - inode = page->mapping->host; |
---|
580 | | - ci = ceph_inode(inode); |
---|
581 | | - fsc = ceph_inode_to_client(inode); |
---|
582 | 605 | |
---|
583 | 606 | /* verify this is a writeable snap context */ |
---|
584 | 607 | snapc = page_snap_context(page); |
---|
.. | .. |
---|
608 | 631 | if (ceph_wbc.i_size < page_off + len) |
---|
609 | 632 | len = ceph_wbc.i_size - page_off; |
---|
610 | 633 | |
---|
611 | | - dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", |
---|
| 634 | + dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", |
---|
612 | 635 | inode, page, page->index, page_off, len, snapc, snapc->seq); |
---|
613 | 636 | |
---|
614 | 637 | if (atomic_long_inc_return(&fsc->writeback_count) > |
---|
.. | .. |
---|
616 | 639 | set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); |
---|
617 | 640 | |
---|
618 | 641 | set_page_writeback(page); |
---|
619 | | - err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), |
---|
620 | | - &ci->i_layout, snapc, page_off, len, |
---|
621 | | - ceph_wbc.truncate_seq, |
---|
622 | | - ceph_wbc.truncate_size, |
---|
623 | | - &inode->i_mtime, &page, 1); |
---|
| 642 | + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, |
---|
| 643 | + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, |
---|
| 644 | + ceph_wbc.truncate_seq, ceph_wbc.truncate_size, |
---|
| 645 | + true); |
---|
| 646 | + if (IS_ERR(req)) { |
---|
| 647 | + redirty_page_for_writepage(wbc, page); |
---|
| 648 | + end_page_writeback(page); |
---|
| 649 | + return PTR_ERR(req); |
---|
| 650 | + } |
---|
| 651 | + |
---|
| 652 | + /* it may be a short write due to an object boundary */ |
---|
| 653 | + WARN_ON_ONCE(len > PAGE_SIZE); |
---|
| 654 | + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); |
---|
| 655 | + dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); |
---|
| 656 | + |
---|
| 657 | + req->r_mtime = inode->i_mtime; |
---|
| 658 | + err = ceph_osdc_start_request(osdc, req, true); |
---|
| 659 | + if (!err) |
---|
| 660 | + err = ceph_osdc_wait_request(osdc, req); |
---|
| 661 | + |
---|
| 662 | + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 663 | + req->r_end_latency, err); |
---|
| 664 | + |
---|
| 665 | + ceph_osdc_put_request(req); |
---|
| 666 | + if (err == 0) |
---|
| 667 | + err = len; |
---|
| 668 | + |
---|
624 | 669 | if (err < 0) { |
---|
625 | 670 | struct writeback_control tmp_wbc; |
---|
626 | 671 | if (!wbc) |
---|
.. | .. |
---|
632 | 677 | end_page_writeback(page); |
---|
633 | 678 | return err; |
---|
634 | 679 | } |
---|
| 680 | + if (err == -EBLOCKLISTED) |
---|
| 681 | + fsc->blocklisted = true; |
---|
635 | 682 | dout("writepage setting page/mapping error %d %p\n", |
---|
636 | 683 | err, page); |
---|
637 | | - SetPageError(page); |
---|
638 | 684 | mapping_set_error(&inode->i_data, err); |
---|
639 | 685 | wbc->pages_skipped++; |
---|
640 | 686 | } else { |
---|
.. | .. |
---|
672 | 718 | } |
---|
673 | 719 | |
---|
674 | 720 | /* |
---|
675 | | - * lame release_pages helper. release_pages() isn't exported to |
---|
676 | | - * modules. |
---|
677 | | - */ |
---|
678 | | -static void ceph_release_pages(struct page **pages, int num) |
---|
679 | | -{ |
---|
680 | | - struct pagevec pvec; |
---|
681 | | - int i; |
---|
682 | | - |
---|
683 | | - pagevec_init(&pvec); |
---|
684 | | - for (i = 0; i < num; i++) { |
---|
685 | | - if (pagevec_add(&pvec, pages[i]) == 0) |
---|
686 | | - pagevec_release(&pvec); |
---|
687 | | - } |
---|
688 | | - pagevec_release(&pvec); |
---|
689 | | -} |
---|
690 | | - |
---|
691 | | -/* |
---|
692 | 721 | * async writeback completion handler. |
---|
693 | 722 | * |
---|
694 | 723 | * If we get an error, set the mapping error bit, but not the individual |
---|
.. | .. |
---|
712 | 741 | if (rc < 0) { |
---|
713 | 742 | mapping_set_error(mapping, rc); |
---|
714 | 743 | ceph_set_error_write(ci); |
---|
| 744 | + if (rc == -EBLOCKLISTED) |
---|
| 745 | + fsc->blocklisted = true; |
---|
715 | 746 | } else { |
---|
716 | 747 | ceph_clear_error_write(ci); |
---|
717 | 748 | } |
---|
| 749 | + |
---|
| 750 | + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 751 | + req->r_end_latency, rc); |
---|
718 | 752 | |
---|
719 | 753 | /* |
---|
720 | 754 | * We lost the cache cap, need to truncate the page before |
---|
.. | .. |
---|
761 | 795 | dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", |
---|
762 | 796 | inode, osd_data->length, rc >= 0 ? num_pages : 0); |
---|
763 | 797 | |
---|
764 | | - ceph_release_pages(osd_data->pages, num_pages); |
---|
| 798 | + release_pages(osd_data->pages, num_pages); |
---|
765 | 799 | } |
---|
766 | 800 | |
---|
767 | 801 | ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); |
---|
768 | 802 | |
---|
769 | 803 | osd_data = osd_req_op_extent_osd_data(req, 0); |
---|
770 | 804 | if (osd_data->pages_from_pool) |
---|
771 | | - mempool_free(osd_data->pages, |
---|
772 | | - ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); |
---|
| 805 | + mempool_free(osd_data->pages, ceph_wb_pagevec_pool); |
---|
773 | 806 | else |
---|
774 | 807 | kfree(osd_data->pages); |
---|
775 | 808 | ceph_osdc_put_request(req); |
---|
.. | .. |
---|
861 | 894 | int num_ops = 0, op_idx; |
---|
862 | 895 | unsigned i, pvec_pages, max_pages, locked_pages = 0; |
---|
863 | 896 | struct page **pages = NULL, **data_pages; |
---|
864 | | - mempool_t *pool = NULL; /* Becomes non-null if mempool used */ |
---|
865 | 897 | struct page *page; |
---|
866 | 898 | pgoff_t strip_unit_end = 0; |
---|
867 | 899 | u64 offset = 0, len = 0; |
---|
| 900 | + bool from_pool = false; |
---|
868 | 901 | |
---|
869 | 902 | max_pages = wsize >> PAGE_SHIFT; |
---|
870 | 903 | |
---|
871 | 904 | get_more_pages: |
---|
872 | | - pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, |
---|
873 | | - end, PAGECACHE_TAG_DIRTY, |
---|
874 | | - max_pages - locked_pages); |
---|
| 905 | + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, |
---|
| 906 | + end, PAGECACHE_TAG_DIRTY); |
---|
875 | 907 | dout("pagevec_lookup_range_tag got %d\n", pvec_pages); |
---|
876 | 908 | if (!pvec_pages && !locked_pages) |
---|
877 | 909 | break; |
---|
.. | .. |
---|
963 | 995 | sizeof(*pages), |
---|
964 | 996 | GFP_NOFS); |
---|
965 | 997 | if (!pages) { |
---|
966 | | - pool = fsc->wb_pagevec_pool; |
---|
967 | | - pages = mempool_alloc(pool, GFP_NOFS); |
---|
| 998 | + from_pool = true; |
---|
| 999 | + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); |
---|
968 | 1000 | BUG_ON(!pages); |
---|
969 | 1001 | } |
---|
970 | 1002 | |
---|
971 | 1003 | len = 0; |
---|
972 | 1004 | } else if (page->index != |
---|
973 | 1005 | (offset + len) >> PAGE_SHIFT) { |
---|
974 | | - if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : |
---|
975 | | - CEPH_OSD_MAX_OPS)) { |
---|
| 1006 | + if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : |
---|
| 1007 | + CEPH_OSD_MAX_OPS)) { |
---|
976 | 1008 | redirty_page_for_writepage(wbc, page); |
---|
977 | 1009 | unlock_page(page); |
---|
978 | 1010 | break; |
---|
.. | .. |
---|
1067 | 1099 | offset, len); |
---|
1068 | 1100 | osd_req_op_extent_osd_data_pages(req, op_idx, |
---|
1069 | 1101 | data_pages, len, 0, |
---|
1070 | | - !!pool, false); |
---|
| 1102 | + from_pool, false); |
---|
1071 | 1103 | osd_req_op_extent_update(req, op_idx, len); |
---|
1072 | 1104 | |
---|
1073 | 1105 | len = 0; |
---|
.. | .. |
---|
1094 | 1126 | dout("writepages got pages at %llu~%llu\n", offset, len); |
---|
1095 | 1127 | |
---|
1096 | 1128 | osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, |
---|
1097 | | - 0, !!pool, false); |
---|
| 1129 | + 0, from_pool, false); |
---|
1098 | 1130 | osd_req_op_extent_update(req, op_idx, len); |
---|
1099 | 1131 | |
---|
1100 | 1132 | BUG_ON(op_idx + 1 != req->r_num_ops); |
---|
1101 | 1133 | |
---|
1102 | | - pool = NULL; |
---|
| 1134 | + from_pool = false; |
---|
1103 | 1135 | if (i < locked_pages) { |
---|
1104 | 1136 | BUG_ON(num_ops <= req->r_num_ops); |
---|
1105 | 1137 | num_ops -= req->r_num_ops; |
---|
.. | .. |
---|
1110 | 1142 | pages = kmalloc_array(locked_pages, sizeof(*pages), |
---|
1111 | 1143 | GFP_NOFS); |
---|
1112 | 1144 | if (!pages) { |
---|
1113 | | - pool = fsc->wb_pagevec_pool; |
---|
1114 | | - pages = mempool_alloc(pool, GFP_NOFS); |
---|
| 1145 | + from_pool = true; |
---|
| 1146 | + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); |
---|
1115 | 1147 | BUG_ON(!pages); |
---|
1116 | 1148 | } |
---|
1117 | 1149 | memcpy(pages, data_pages + i, |
---|
.. | .. |
---|
1206 | 1238 | return ret; |
---|
1207 | 1239 | } |
---|
1208 | 1240 | |
---|
1209 | | -/* |
---|
1210 | | - * We are only allowed to write into/dirty the page if the page is |
---|
1211 | | - * clean, or already dirty within the same snap context. |
---|
| 1241 | +/** |
---|
| 1242 | + * ceph_find_incompatible - find an incompatible context and return it |
---|
| 1243 | + * @page: page being dirtied |
---|
1212 | 1244 | * |
---|
1213 | | - * called with page locked. |
---|
1214 | | - * return success with page locked, |
---|
1215 | | - * or any failure (incl -EAGAIN) with page unlocked. |
---|
| 1245 | + * We are only allowed to write into/dirty a page if the page is |
---|
| 1246 | + * clean, or already dirty within the same snap context. Returns a |
---|
| 1247 | + * referenced conflicting context if there is one, NULL if there isn't, |
---|
| 1248 | + * or an ERR_PTR()-encoded negative error code on other errors. |
---|
| 1249 | + * |
---|
| 1250 | + * Must be called with page lock held. |
---|
1216 | 1251 | */ |
---|
1217 | | -static int ceph_update_writeable_page(struct file *file, |
---|
1218 | | - loff_t pos, unsigned len, |
---|
1219 | | - struct page *page) |
---|
| 1252 | +static struct ceph_snap_context * |
---|
| 1253 | +ceph_find_incompatible(struct page *page) |
---|
1220 | 1254 | { |
---|
1221 | | - struct inode *inode = file_inode(file); |
---|
| 1255 | + struct inode *inode = page->mapping->host; |
---|
1222 | 1256 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
1223 | 1257 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1224 | | - loff_t page_off = pos & PAGE_MASK; |
---|
1225 | | - int pos_in_page = pos & ~PAGE_MASK; |
---|
1226 | | - int end_in_page = pos_in_page + len; |
---|
1227 | | - loff_t i_size; |
---|
1228 | | - int r; |
---|
1229 | | - struct ceph_snap_context *snapc, *oldest; |
---|
1230 | 1258 | |
---|
1231 | 1259 | if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
---|
1232 | 1260 | dout(" page %p forced umount\n", page); |
---|
1233 | | - unlock_page(page); |
---|
1234 | | - return -EIO; |
---|
| 1261 | + return ERR_PTR(-EIO); |
---|
1235 | 1262 | } |
---|
1236 | 1263 | |
---|
1237 | | -retry_locked: |
---|
1238 | | - /* writepages currently holds page lock, but if we change that later, */ |
---|
1239 | | - wait_on_page_writeback(page); |
---|
| 1264 | + for (;;) { |
---|
| 1265 | + struct ceph_snap_context *snapc, *oldest; |
---|
1240 | 1266 | |
---|
1241 | | - snapc = page_snap_context(page); |
---|
1242 | | - if (snapc && snapc != ci->i_head_snapc) { |
---|
| 1267 | + wait_on_page_writeback(page); |
---|
| 1268 | + |
---|
| 1269 | + snapc = page_snap_context(page); |
---|
| 1270 | + if (!snapc || snapc == ci->i_head_snapc) |
---|
| 1271 | + break; |
---|
| 1272 | + |
---|
1243 | 1273 | /* |
---|
1244 | 1274 | * this page is already dirty in another (older) snap |
---|
1245 | 1275 | * context! is it writeable now? |
---|
1246 | 1276 | */ |
---|
1247 | 1277 | oldest = get_oldest_context(inode, NULL, NULL); |
---|
1248 | 1278 | if (snapc->seq > oldest->seq) { |
---|
| 1279 | + /* not writeable -- return it for the caller to deal with */ |
---|
1249 | 1280 | ceph_put_snap_context(oldest); |
---|
1250 | | - dout(" page %p snapc %p not current or oldest\n", |
---|
1251 | | - page, snapc); |
---|
1252 | | - /* |
---|
1253 | | - * queue for writeback, and wait for snapc to |
---|
1254 | | - * be writeable or written |
---|
1255 | | - */ |
---|
1256 | | - snapc = ceph_get_snap_context(snapc); |
---|
1257 | | - unlock_page(page); |
---|
1258 | | - ceph_queue_writeback(inode); |
---|
1259 | | - r = wait_event_killable(ci->i_cap_wq, |
---|
1260 | | - context_is_writeable_or_written(inode, snapc)); |
---|
1261 | | - ceph_put_snap_context(snapc); |
---|
1262 | | - if (r == -ERESTARTSYS) |
---|
1263 | | - return r; |
---|
1264 | | - return -EAGAIN; |
---|
| 1281 | + dout(" page %p snapc %p not current or oldest\n", page, snapc); |
---|
| 1282 | + return ceph_get_snap_context(snapc); |
---|
1265 | 1283 | } |
---|
1266 | 1284 | ceph_put_snap_context(oldest); |
---|
1267 | 1285 | |
---|
1268 | 1286 | /* yay, writeable, do it now (without dropping page lock) */ |
---|
1269 | | - dout(" page %p snapc %p not current, but oldest\n", |
---|
1270 | | - page, snapc); |
---|
1271 | | - if (!clear_page_dirty_for_io(page)) |
---|
1272 | | - goto retry_locked; |
---|
1273 | | - r = writepage_nounlock(page, NULL); |
---|
1274 | | - if (r < 0) |
---|
1275 | | - goto fail_unlock; |
---|
1276 | | - goto retry_locked; |
---|
| 1287 | + dout(" page %p snapc %p not current, but oldest\n", page, snapc); |
---|
| 1288 | + if (clear_page_dirty_for_io(page)) { |
---|
| 1289 | + int r = writepage_nounlock(page, NULL); |
---|
| 1290 | + if (r < 0) |
---|
| 1291 | + return ERR_PTR(r); |
---|
| 1292 | + } |
---|
1277 | 1293 | } |
---|
| 1294 | + return NULL; |
---|
| 1295 | +} |
---|
1278 | 1296 | |
---|
1279 | | - if (PageUptodate(page)) { |
---|
1280 | | - dout(" page %p already uptodate\n", page); |
---|
1281 | | - return 0; |
---|
1282 | | - } |
---|
| 1297 | +/** |
---|
| 1298 | + * skip_page_read - prep a page for writing without reading first |
---|
| 1299 | + * @page: page being prepared |
---|
| 1300 | + * @pos: starting position for the write |
---|
| 1301 | + * @len: length of write |
---|
| 1302 | + * |
---|
| 1303 | + * In some cases, write_begin doesn't need to read at all: |
---|
| 1304 | + * - full page write |
---|
| 1305 | + * - file is currently zero-length |
---|
| 1306 | + * - write that lies in a page that is completely beyond EOF |
---|
| 1307 | + * - write that covers the page from start to EOF or beyond it |
---|
| 1308 | + * |
---|
| 1309 | + * If any of these criteria are met, then zero out the unwritten parts |
---|
| 1310 | + * of the page and return true. Otherwise, return false. |
---|
| 1311 | + */ |
---|
| 1312 | +static bool skip_page_read(struct page *page, loff_t pos, size_t len) |
---|
| 1313 | +{ |
---|
| 1314 | + struct inode *inode = page->mapping->host; |
---|
| 1315 | + loff_t i_size = i_size_read(inode); |
---|
| 1316 | + size_t offset = offset_in_page(pos); |
---|
1283 | 1317 | |
---|
1284 | | - /* full page? */ |
---|
1285 | | - if (pos_in_page == 0 && len == PAGE_SIZE) |
---|
1286 | | - return 0; |
---|
| 1318 | + /* Full page write */ |
---|
| 1319 | + if (offset == 0 && len >= PAGE_SIZE) |
---|
| 1320 | + return true; |
---|
1287 | 1321 | |
---|
1288 | | - /* past end of file? */ |
---|
1289 | | - i_size = i_size_read(inode); |
---|
| 1322 | + /* pos beyond last page in the file */ |
---|
| 1323 | + if (pos - offset >= i_size) |
---|
| 1324 | + goto zero_out; |
---|
1290 | 1325 | |
---|
1291 | | - if (page_off >= i_size || |
---|
1292 | | - (pos_in_page == 0 && (pos+len) >= i_size && |
---|
1293 | | - end_in_page - pos_in_page != PAGE_SIZE)) { |
---|
1294 | | - dout(" zeroing %p 0 - %d and %d - %d\n", |
---|
1295 | | - page, pos_in_page, end_in_page, (int)PAGE_SIZE); |
---|
1296 | | - zero_user_segments(page, |
---|
1297 | | - 0, pos_in_page, |
---|
1298 | | - end_in_page, PAGE_SIZE); |
---|
1299 | | - return 0; |
---|
1300 | | - } |
---|
| 1326 | + /* write that covers the whole page from start to EOF or beyond it */ |
---|
| 1327 | + if (offset == 0 && (pos + len) >= i_size) |
---|
| 1328 | + goto zero_out; |
---|
1301 | 1329 | |
---|
1302 | | - /* we need to read it. */ |
---|
1303 | | - r = ceph_do_readpage(file, page); |
---|
1304 | | - if (r < 0) { |
---|
1305 | | - if (r == -EINPROGRESS) |
---|
1306 | | - return -EAGAIN; |
---|
1307 | | - goto fail_unlock; |
---|
1308 | | - } |
---|
1309 | | - goto retry_locked; |
---|
1310 | | -fail_unlock: |
---|
1311 | | - unlock_page(page); |
---|
1312 | | - return r; |
---|
| 1330 | + return false; |
---|
| 1331 | +zero_out: |
---|
| 1332 | + zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); |
---|
| 1333 | + return true; |
---|
1313 | 1334 | } |
---|
1314 | 1335 | |
---|
1315 | 1336 | /* |
---|
.. | .. |
---|
1321 | 1342 | struct page **pagep, void **fsdata) |
---|
1322 | 1343 | { |
---|
1323 | 1344 | struct inode *inode = file_inode(file); |
---|
1324 | | - struct page *page; |
---|
| 1345 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 1346 | + struct ceph_snap_context *snapc; |
---|
| 1347 | + struct page *page = NULL; |
---|
1325 | 1348 | pgoff_t index = pos >> PAGE_SHIFT; |
---|
1326 | | - int r; |
---|
| 1349 | + int r = 0; |
---|
1327 | 1350 | |
---|
1328 | | - do { |
---|
1329 | | - /* get a page */ |
---|
| 1351 | + dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); |
---|
| 1352 | + |
---|
| 1353 | + for (;;) { |
---|
1330 | 1354 | page = grab_cache_page_write_begin(mapping, index, 0); |
---|
1331 | | - if (!page) |
---|
1332 | | - return -ENOMEM; |
---|
| 1355 | + if (!page) { |
---|
| 1356 | + r = -ENOMEM; |
---|
| 1357 | + break; |
---|
| 1358 | + } |
---|
1333 | 1359 | |
---|
1334 | | - dout("write_begin file %p inode %p page %p %d~%d\n", file, |
---|
1335 | | - inode, page, (int)pos, (int)len); |
---|
1336 | | - |
---|
1337 | | - r = ceph_update_writeable_page(file, pos, len, page); |
---|
1338 | | - if (r < 0) |
---|
| 1360 | + snapc = ceph_find_incompatible(page); |
---|
| 1361 | + if (snapc) { |
---|
| 1362 | + if (IS_ERR(snapc)) { |
---|
| 1363 | + r = PTR_ERR(snapc); |
---|
| 1364 | + break; |
---|
| 1365 | + } |
---|
| 1366 | + unlock_page(page); |
---|
1339 | 1367 | put_page(page); |
---|
1340 | | - else |
---|
1341 | | - *pagep = page; |
---|
1342 | | - } while (r == -EAGAIN); |
---|
| 1368 | + page = NULL; |
---|
| 1369 | + ceph_queue_writeback(inode); |
---|
| 1370 | + r = wait_event_killable(ci->i_cap_wq, |
---|
| 1371 | + context_is_writeable_or_written(inode, snapc)); |
---|
| 1372 | + ceph_put_snap_context(snapc); |
---|
| 1373 | + if (r != 0) |
---|
| 1374 | + break; |
---|
| 1375 | + continue; |
---|
| 1376 | + } |
---|
1343 | 1377 | |
---|
| 1378 | + if (PageUptodate(page)) { |
---|
| 1379 | + dout(" page %p already uptodate\n", page); |
---|
| 1380 | + break; |
---|
| 1381 | + } |
---|
| 1382 | + |
---|
| 1383 | + /* No need to read in some cases */ |
---|
| 1384 | + if (skip_page_read(page, pos, len)) |
---|
| 1385 | + break; |
---|
| 1386 | + |
---|
| 1387 | + /* |
---|
| 1388 | + * We need to read it. If we get back -EINPROGRESS, then the page was |
---|
| 1389 | + * handed off to fscache and it will be unlocked when the read completes. |
---|
| 1390 | + * Refind the page in that case so we can reacquire the page lock. Otherwise |
---|
| 1391 | + * we got a hard error or the read was completed synchronously. |
---|
| 1392 | + */ |
---|
| 1393 | + r = ceph_do_readpage(file, page); |
---|
| 1394 | + if (r != -EINPROGRESS) |
---|
| 1395 | + break; |
---|
| 1396 | + } |
---|
| 1397 | + |
---|
| 1398 | + if (r < 0) { |
---|
| 1399 | + if (page) { |
---|
| 1400 | + unlock_page(page); |
---|
| 1401 | + put_page(page); |
---|
| 1402 | + } |
---|
| 1403 | + } else { |
---|
| 1404 | + *pagep = page; |
---|
| 1405 | + } |
---|
1344 | 1406 | return r; |
---|
1345 | 1407 | } |
---|
1346 | 1408 | |
---|
.. | .. |
---|
1444 | 1506 | want = CEPH_CAP_FILE_CACHE; |
---|
1445 | 1507 | |
---|
1446 | 1508 | got = 0; |
---|
1447 | | - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); |
---|
| 1509 | + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, |
---|
| 1510 | + &got, &pinned_page); |
---|
1448 | 1511 | if (err < 0) |
---|
1449 | 1512 | goto out_restore; |
---|
1450 | 1513 | |
---|
.. | .. |
---|
1488 | 1551 | if (err < 0 || off >= i_size_read(inode)) { |
---|
1489 | 1552 | unlock_page(page); |
---|
1490 | 1553 | put_page(page); |
---|
1491 | | - if (err == -ENOMEM) |
---|
1492 | | - ret = VM_FAULT_OOM; |
---|
1493 | | - else |
---|
1494 | | - ret = VM_FAULT_SIGBUS; |
---|
| 1554 | + ret = vmf_error(err); |
---|
1495 | 1555 | goto out_inline; |
---|
1496 | 1556 | } |
---|
1497 | 1557 | if (err < PAGE_SIZE) |
---|
.. | .. |
---|
1535 | 1595 | if (!prealloc_cf) |
---|
1536 | 1596 | return VM_FAULT_OOM; |
---|
1537 | 1597 | |
---|
| 1598 | + sb_start_pagefault(inode->i_sb); |
---|
1538 | 1599 | ceph_block_sigs(&oldset); |
---|
1539 | 1600 | |
---|
1540 | 1601 | if (ci->i_inline_version != CEPH_INLINE_NONE) { |
---|
.. | .. |
---|
1563 | 1624 | want = CEPH_CAP_FILE_BUFFER; |
---|
1564 | 1625 | |
---|
1565 | 1626 | got = 0; |
---|
1566 | | - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, |
---|
| 1627 | + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, |
---|
1567 | 1628 | &got, NULL); |
---|
1568 | 1629 | if (err < 0) |
---|
1569 | 1630 | goto out_free; |
---|
.. | .. |
---|
1573 | 1634 | |
---|
1574 | 1635 | /* Update time before taking page lock */ |
---|
1575 | 1636 | file_update_time(vma->vm_file); |
---|
| 1637 | + inode_inc_iversion_raw(inode); |
---|
1576 | 1638 | |
---|
1577 | 1639 | do { |
---|
| 1640 | + struct ceph_snap_context *snapc; |
---|
| 1641 | + |
---|
1578 | 1642 | lock_page(page); |
---|
1579 | 1643 | |
---|
1580 | | - if ((off > size) || (page->mapping != inode->i_mapping)) { |
---|
| 1644 | + if (page_mkwrite_check_truncate(page, inode) < 0) { |
---|
1581 | 1645 | unlock_page(page); |
---|
1582 | 1646 | ret = VM_FAULT_NOPAGE; |
---|
1583 | 1647 | break; |
---|
1584 | 1648 | } |
---|
1585 | 1649 | |
---|
1586 | | - err = ceph_update_writeable_page(vma->vm_file, off, len, page); |
---|
1587 | | - if (err >= 0) { |
---|
| 1650 | + snapc = ceph_find_incompatible(page); |
---|
| 1651 | + if (!snapc) { |
---|
1588 | 1652 | /* success. we'll keep the page locked. */ |
---|
1589 | 1653 | set_page_dirty(page); |
---|
1590 | 1654 | ret = VM_FAULT_LOCKED; |
---|
| 1655 | + break; |
---|
1591 | 1656 | } |
---|
1592 | | - } while (err == -EAGAIN); |
---|
| 1657 | + |
---|
| 1658 | + unlock_page(page); |
---|
| 1659 | + |
---|
| 1660 | + if (IS_ERR(snapc)) { |
---|
| 1661 | + ret = VM_FAULT_SIGBUS; |
---|
| 1662 | + break; |
---|
| 1663 | + } |
---|
| 1664 | + |
---|
| 1665 | + ceph_queue_writeback(inode); |
---|
| 1666 | + err = wait_event_killable(ci->i_cap_wq, |
---|
| 1667 | + context_is_writeable_or_written(inode, snapc)); |
---|
| 1668 | + ceph_put_snap_context(snapc); |
---|
| 1669 | + } while (err == 0); |
---|
1593 | 1670 | |
---|
1594 | 1671 | if (ret == VM_FAULT_LOCKED || |
---|
1595 | 1672 | ci->i_inline_version != CEPH_INLINE_NONE) { |
---|
.. | .. |
---|
1608 | 1685 | ceph_put_cap_refs(ci, got); |
---|
1609 | 1686 | out_free: |
---|
1610 | 1687 | ceph_restore_sigs(&oldset); |
---|
| 1688 | + sb_end_pagefault(inode->i_sb); |
---|
1611 | 1689 | ceph_free_cap_flush(prealloc_cf); |
---|
1612 | 1690 | if (err < 0) |
---|
1613 | 1691 | ret = vmf_error(err); |
---|
.. | .. |
---|
1773 | 1851 | err = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
---|
1774 | 1852 | if (!err) |
---|
1775 | 1853 | err = ceph_osdc_wait_request(&fsc->client->osdc, req); |
---|
| 1854 | + |
---|
| 1855 | + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 1856 | + req->r_end_latency, err); |
---|
| 1857 | + |
---|
1776 | 1858 | out_put: |
---|
1777 | 1859 | ceph_osdc_put_request(req); |
---|
1778 | 1860 | if (err == -ECANCELED) |
---|
.. | .. |
---|
1940 | 2022 | |
---|
1941 | 2023 | if (err >= 0 || err == -ENOENT) |
---|
1942 | 2024 | have |= POOL_READ; |
---|
1943 | | - else if (err != -EPERM) |
---|
| 2025 | + else if (err != -EPERM) { |
---|
| 2026 | + if (err == -EBLOCKLISTED) |
---|
| 2027 | + fsc->blocklisted = true; |
---|
1944 | 2028 | goto out_unlock; |
---|
| 2029 | + } |
---|
1945 | 2030 | |
---|
1946 | 2031 | if (err2 == 0 || err2 == -EEXIST) |
---|
1947 | 2032 | have |= POOL_WRITE; |
---|
1948 | 2033 | else if (err2 != -EPERM) { |
---|
| 2034 | + if (err2 == -EBLOCKLISTED) |
---|
| 2035 | + fsc->blocklisted = true; |
---|
1949 | 2036 | err = err2; |
---|
1950 | 2037 | goto out_unlock; |
---|
1951 | 2038 | } |
---|
.. | .. |
---|
1983 | 2070 | return err; |
---|
1984 | 2071 | } |
---|
1985 | 2072 | |
---|
1986 | | -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) |
---|
| 2073 | +int ceph_pool_perm_check(struct inode *inode, int need) |
---|
1987 | 2074 | { |
---|
1988 | | - s64 pool; |
---|
| 2075 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1989 | 2076 | struct ceph_string *pool_ns; |
---|
| 2077 | + s64 pool; |
---|
1990 | 2078 | int ret, flags; |
---|
1991 | 2079 | |
---|
1992 | 2080 | if (ci->i_vino.snap != CEPH_NOSNAP) { |
---|
.. | .. |
---|
1998 | 2086 | return 0; |
---|
1999 | 2087 | } |
---|
2000 | 2088 | |
---|
2001 | | - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), |
---|
| 2089 | + if (ceph_test_mount_opt(ceph_inode_to_client(inode), |
---|
2002 | 2090 | NOPOOLPERM)) |
---|
2003 | 2091 | return 0; |
---|
2004 | 2092 | |
---|