.. | .. |
---|
8 | 8 | #include <linux/vmalloc.h> |
---|
9 | 9 | #include <linux/wait.h> |
---|
10 | 10 | #include <linux/writeback.h> |
---|
| 11 | +#include <linux/iversion.h> |
---|
11 | 12 | |
---|
12 | 13 | #include "super.h" |
---|
13 | 14 | #include "mds_client.h" |
---|
.. | .. |
---|
148 | 149 | spin_unlock(&mdsc->caps_list_lock); |
---|
149 | 150 | } |
---|
150 | 151 | |
---|
151 | | -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) |
---|
| 152 | +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, |
---|
| 153 | + struct ceph_mount_options *fsopt) |
---|
152 | 154 | { |
---|
153 | 155 | spin_lock(&mdsc->caps_list_lock); |
---|
154 | | - mdsc->caps_min_count += delta; |
---|
155 | | - BUG_ON(mdsc->caps_min_count < 0); |
---|
| 156 | + mdsc->caps_min_count = fsopt->max_readdir; |
---|
| 157 | + if (mdsc->caps_min_count < 1024) |
---|
| 158 | + mdsc->caps_min_count = 1024; |
---|
| 159 | + mdsc->caps_use_max = fsopt->caps_max; |
---|
| 160 | + if (mdsc->caps_use_max > 0 && |
---|
| 161 | + mdsc->caps_use_max < mdsc->caps_min_count) |
---|
| 162 | + mdsc->caps_use_max = mdsc->caps_min_count; |
---|
156 | 163 | spin_unlock(&mdsc->caps_list_lock); |
---|
157 | 164 | } |
---|
158 | 165 | |
---|
.. | .. |
---|
272 | 279 | if (!err) { |
---|
273 | 280 | BUG_ON(have + alloc != need); |
---|
274 | 281 | ctx->count = need; |
---|
| 282 | + ctx->used = 0; |
---|
275 | 283 | } |
---|
276 | 284 | |
---|
277 | 285 | spin_lock(&mdsc->caps_list_lock); |
---|
.. | .. |
---|
295 | 303 | } |
---|
296 | 304 | |
---|
297 | 305 | void ceph_unreserve_caps(struct ceph_mds_client *mdsc, |
---|
298 | | - struct ceph_cap_reservation *ctx) |
---|
| 306 | + struct ceph_cap_reservation *ctx) |
---|
299 | 307 | { |
---|
| 308 | + bool reclaim = false; |
---|
| 309 | + if (!ctx->count) |
---|
| 310 | + return; |
---|
| 311 | + |
---|
300 | 312 | dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); |
---|
301 | 313 | spin_lock(&mdsc->caps_list_lock); |
---|
302 | 314 | __ceph_unreserve_caps(mdsc, ctx->count); |
---|
303 | 315 | ctx->count = 0; |
---|
| 316 | + |
---|
| 317 | + if (mdsc->caps_use_max > 0 && |
---|
| 318 | + mdsc->caps_use_count > mdsc->caps_use_max) |
---|
| 319 | + reclaim = true; |
---|
304 | 320 | spin_unlock(&mdsc->caps_list_lock); |
---|
| 321 | + |
---|
| 322 | + if (reclaim) |
---|
| 323 | + ceph_reclaim_caps_nr(mdsc, ctx->used); |
---|
305 | 324 | } |
---|
306 | 325 | |
---|
307 | 326 | struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, |
---|
.. | .. |
---|
346 | 365 | BUG_ON(list_empty(&mdsc->caps_list)); |
---|
347 | 366 | |
---|
348 | 367 | ctx->count--; |
---|
| 368 | + ctx->used++; |
---|
349 | 369 | mdsc->caps_reserve_count--; |
---|
350 | 370 | mdsc->caps_use_count++; |
---|
351 | 371 | |
---|
.. | .. |
---|
438 | 458 | } |
---|
439 | 459 | |
---|
440 | 460 | /* |
---|
441 | | - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. |
---|
442 | | - */ |
---|
443 | | -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) |
---|
444 | | -{ |
---|
445 | | - struct ceph_cap *cap; |
---|
446 | | - int mds = -1; |
---|
447 | | - struct rb_node *p; |
---|
448 | | - |
---|
449 | | - /* prefer mds with WR|BUFFER|EXCL caps */ |
---|
450 | | - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
---|
451 | | - cap = rb_entry(p, struct ceph_cap, ci_node); |
---|
452 | | - mds = cap->mds; |
---|
453 | | - if (cap->issued & (CEPH_CAP_FILE_WR | |
---|
454 | | - CEPH_CAP_FILE_BUFFER | |
---|
455 | | - CEPH_CAP_FILE_EXCL)) |
---|
456 | | - break; |
---|
457 | | - } |
---|
458 | | - return mds; |
---|
459 | | -} |
---|
460 | | - |
---|
461 | | -int ceph_get_cap_mds(struct inode *inode) |
---|
462 | | -{ |
---|
463 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
---|
464 | | - int mds; |
---|
465 | | - spin_lock(&ci->i_ceph_lock); |
---|
466 | | - mds = __ceph_get_cap_mds(ceph_inode(inode)); |
---|
467 | | - spin_unlock(&ci->i_ceph_lock); |
---|
468 | | - return mds; |
---|
469 | | -} |
---|
470 | | - |
---|
471 | | -/* |
---|
472 | 461 | * Called under i_ceph_lock. |
---|
473 | 462 | */ |
---|
474 | 463 | static void __insert_cap_node(struct ceph_inode_info *ci, |
---|
.. | .. |
---|
500 | 489 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, |
---|
501 | 490 | struct ceph_inode_info *ci) |
---|
502 | 491 | { |
---|
503 | | - struct ceph_mount_options *ma = mdsc->fsc->mount_options; |
---|
504 | | - |
---|
505 | | - ci->i_hold_caps_min = round_jiffies(jiffies + |
---|
506 | | - ma->caps_wanted_delay_min * HZ); |
---|
| 492 | + struct ceph_mount_options *opt = mdsc->fsc->mount_options; |
---|
507 | 493 | ci->i_hold_caps_max = round_jiffies(jiffies + |
---|
508 | | - ma->caps_wanted_delay_max * HZ); |
---|
509 | | - dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, |
---|
510 | | - ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); |
---|
| 494 | + opt->caps_wanted_delay_max * HZ); |
---|
| 495 | + dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, |
---|
| 496 | + ci->i_hold_caps_max - jiffies); |
---|
511 | 497 | } |
---|
512 | 498 | |
---|
513 | 499 | /* |
---|
.. | .. |
---|
521 | 507 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
---|
522 | 508 | struct ceph_inode_info *ci) |
---|
523 | 509 | { |
---|
524 | | - __cap_set_timeouts(mdsc, ci); |
---|
525 | | - dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, |
---|
| 510 | + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, |
---|
526 | 511 | ci->i_ceph_flags, ci->i_hold_caps_max); |
---|
527 | 512 | if (!mdsc->stopping) { |
---|
528 | 513 | spin_lock(&mdsc->cap_delay_lock); |
---|
.. | .. |
---|
531 | 516 | goto no_change; |
---|
532 | 517 | list_del_init(&ci->i_cap_delay_list); |
---|
533 | 518 | } |
---|
| 519 | + __cap_set_timeouts(mdsc, ci); |
---|
534 | 520 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); |
---|
535 | 521 | no_change: |
---|
536 | 522 | spin_unlock(&mdsc->cap_delay_lock); |
---|
.. | .. |
---|
570 | 556 | spin_unlock(&mdsc->cap_delay_lock); |
---|
571 | 557 | } |
---|
572 | 558 | |
---|
573 | | -/* |
---|
574 | | - * Common issue checks for add_cap, handle_cap_grant. |
---|
575 | | - */ |
---|
| 559 | +/* Common issue checks for add_cap, handle_cap_grant. */ |
---|
576 | 560 | static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, |
---|
577 | 561 | unsigned issued) |
---|
578 | 562 | { |
---|
579 | 563 | unsigned had = __ceph_caps_issued(ci, NULL); |
---|
580 | 564 | |
---|
| 565 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 566 | + |
---|
581 | 567 | /* |
---|
582 | 568 | * Each time we receive FILE_CACHE anew, we increment |
---|
583 | 569 | * i_rdcache_gen. |
---|
584 | 570 | */ |
---|
585 | | - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
---|
| 571 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
---|
| 572 | + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
---|
586 | 573 | (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { |
---|
587 | 574 | ci->i_rdcache_gen++; |
---|
588 | 575 | } |
---|
.. | .. |
---|
601 | 588 | __ceph_dir_clear_complete(ci); |
---|
602 | 589 | } |
---|
603 | 590 | } |
---|
| 591 | + |
---|
| 592 | + /* Wipe saved layout if we're losing DIR_CREATE caps */ |
---|
| 593 | + if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && |
---|
| 594 | + !(issued & CEPH_CAP_DIR_CREATE)) { |
---|
| 595 | + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); |
---|
| 596 | + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); |
---|
| 597 | + } |
---|
| 598 | +} |
---|
| 599 | + |
---|
| 600 | +/** |
---|
| 601 | + * change_auth_cap_ses - move inode to appropriate lists when auth caps change |
---|
| 602 | + * @ci: inode to be moved |
---|
| 603 | + * @session: new auth caps session |
---|
| 604 | + */ |
---|
| 605 | +static void change_auth_cap_ses(struct ceph_inode_info *ci, |
---|
| 606 | + struct ceph_mds_session *session) |
---|
| 607 | +{ |
---|
| 608 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 609 | + |
---|
| 610 | + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) |
---|
| 611 | + return; |
---|
| 612 | + |
---|
| 613 | + spin_lock(&session->s_mdsc->cap_dirty_lock); |
---|
| 614 | + if (!list_empty(&ci->i_dirty_item)) |
---|
| 615 | + list_move(&ci->i_dirty_item, &session->s_cap_dirty); |
---|
| 616 | + if (!list_empty(&ci->i_flushing_item)) |
---|
| 617 | + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
---|
| 618 | + spin_unlock(&session->s_mdsc->cap_dirty_lock); |
---|
604 | 619 | } |
---|
605 | 620 | |
---|
606 | 621 | /* |
---|
607 | 622 | * Add a capability under the given MDS session. |
---|
608 | 623 | * |
---|
609 | | - * Caller should hold session snap_rwsem (read) and s_mutex. |
---|
| 624 | + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock |
---|
610 | 625 | * |
---|
611 | 626 | * @fmode is the open file mode, if we are opening a file, otherwise |
---|
612 | 627 | * it is < 0. (This is so we can atomically add the cap and add an |
---|
.. | .. |
---|
614 | 629 | */ |
---|
615 | 630 | void ceph_add_cap(struct inode *inode, |
---|
616 | 631 | struct ceph_mds_session *session, u64 cap_id, |
---|
617 | | - int fmode, unsigned issued, unsigned wanted, |
---|
| 632 | + unsigned issued, unsigned wanted, |
---|
618 | 633 | unsigned seq, unsigned mseq, u64 realmino, int flags, |
---|
619 | 634 | struct ceph_cap **new_cap) |
---|
620 | 635 | { |
---|
.. | .. |
---|
623 | 638 | struct ceph_cap *cap; |
---|
624 | 639 | int mds = session->s_mds; |
---|
625 | 640 | int actual_wanted; |
---|
| 641 | + u32 gen; |
---|
| 642 | + |
---|
| 643 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
626 | 644 | |
---|
627 | 645 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, |
---|
628 | 646 | session->s_mds, cap_id, ceph_cap_string(issued), seq); |
---|
629 | 647 | |
---|
630 | | - /* |
---|
631 | | - * If we are opening the file, include file mode wanted bits |
---|
632 | | - * in wanted. |
---|
633 | | - */ |
---|
634 | | - if (fmode >= 0) |
---|
635 | | - wanted |= ceph_caps_for_mode(fmode); |
---|
| 648 | + spin_lock(&session->s_gen_ttl_lock); |
---|
| 649 | + gen = session->s_cap_gen; |
---|
| 650 | + spin_unlock(&session->s_gen_ttl_lock); |
---|
636 | 651 | |
---|
637 | 652 | cap = __get_cap_for_mds(ci, mds); |
---|
638 | 653 | if (!cap) { |
---|
.. | .. |
---|
653 | 668 | spin_lock(&session->s_cap_lock); |
---|
654 | 669 | list_add_tail(&cap->session_caps, &session->s_caps); |
---|
655 | 670 | session->s_nr_caps++; |
---|
| 671 | + atomic64_inc(&mdsc->metric.total_caps); |
---|
656 | 672 | spin_unlock(&session->s_cap_lock); |
---|
657 | 673 | } else { |
---|
| 674 | + spin_lock(&session->s_cap_lock); |
---|
| 675 | + list_move_tail(&cap->session_caps, &session->s_caps); |
---|
| 676 | + spin_unlock(&session->s_cap_lock); |
---|
| 677 | + |
---|
| 678 | + if (cap->cap_gen < gen) |
---|
| 679 | + cap->issued = cap->implemented = CEPH_CAP_PIN; |
---|
| 680 | + |
---|
658 | 681 | /* |
---|
659 | 682 | * auth mds of the inode changed. we received the cap export |
---|
660 | 683 | * message, but still haven't received the cap import message. |
---|
.. | .. |
---|
726 | 749 | if (flags & CEPH_CAP_FLAG_AUTH) { |
---|
727 | 750 | if (!ci->i_auth_cap || |
---|
728 | 751 | ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { |
---|
| 752 | + if (ci->i_auth_cap && |
---|
| 753 | + ci->i_auth_cap->session != cap->session) |
---|
| 754 | + change_auth_cap_ses(ci, cap->session); |
---|
729 | 755 | ci->i_auth_cap = cap; |
---|
730 | 756 | cap->mds_wanted = wanted; |
---|
731 | 757 | } |
---|
.. | .. |
---|
746 | 772 | cap->seq = seq; |
---|
747 | 773 | cap->issue_seq = seq; |
---|
748 | 774 | cap->mseq = mseq; |
---|
749 | | - cap->cap_gen = session->s_cap_gen; |
---|
750 | | - |
---|
751 | | - if (fmode >= 0) |
---|
752 | | - __ceph_get_fmode(ci, fmode); |
---|
| 775 | + cap->cap_gen = gen; |
---|
753 | 776 | } |
---|
754 | 777 | |
---|
755 | 778 | /* |
---|
.. | .. |
---|
864 | 887 | int have = ci->i_snap_caps; |
---|
865 | 888 | |
---|
866 | 889 | if ((have & mask) == mask) { |
---|
867 | | - dout("__ceph_caps_issued_mask %p snap issued %s" |
---|
868 | | - " (mask %s)\n", &ci->vfs_inode, |
---|
| 890 | + dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" |
---|
| 891 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), |
---|
869 | 892 | ceph_cap_string(have), |
---|
870 | 893 | ceph_cap_string(mask)); |
---|
871 | 894 | return 1; |
---|
.. | .. |
---|
876 | 899 | if (!__cap_is_valid(cap)) |
---|
877 | 900 | continue; |
---|
878 | 901 | if ((cap->issued & mask) == mask) { |
---|
879 | | - dout("__ceph_caps_issued_mask %p cap %p issued %s" |
---|
880 | | - " (mask %s)\n", &ci->vfs_inode, cap, |
---|
| 902 | + dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" |
---|
| 903 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, |
---|
881 | 904 | ceph_cap_string(cap->issued), |
---|
882 | 905 | ceph_cap_string(mask)); |
---|
883 | 906 | if (touch) |
---|
.. | .. |
---|
888 | 911 | /* does a combination of caps satisfy mask? */ |
---|
889 | 912 | have |= cap->issued; |
---|
890 | 913 | if ((have & mask) == mask) { |
---|
891 | | - dout("__ceph_caps_issued_mask %p combo issued %s" |
---|
892 | | - " (mask %s)\n", &ci->vfs_inode, |
---|
| 914 | + dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" |
---|
| 915 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), |
---|
893 | 916 | ceph_cap_string(cap->issued), |
---|
894 | 917 | ceph_cap_string(mask)); |
---|
895 | 918 | if (touch) { |
---|
.. | .. |
---|
903 | 926 | ci_node); |
---|
904 | 927 | if (!__cap_is_valid(cap)) |
---|
905 | 928 | continue; |
---|
906 | | - __touch_cap(cap); |
---|
| 929 | + if (cap->issued & mask) |
---|
| 930 | + __touch_cap(cap); |
---|
907 | 931 | } |
---|
908 | 932 | } |
---|
909 | 933 | return 1; |
---|
.. | .. |
---|
911 | 935 | } |
---|
912 | 936 | |
---|
913 | 937 | return 0; |
---|
| 938 | +} |
---|
| 939 | + |
---|
| 940 | +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, |
---|
| 941 | + int touch) |
---|
| 942 | +{ |
---|
| 943 | + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); |
---|
| 944 | + int r; |
---|
| 945 | + |
---|
| 946 | + r = __ceph_caps_issued_mask(ci, mask, touch); |
---|
| 947 | + if (r) |
---|
| 948 | + ceph_update_cap_hit(&fsc->mdsc->metric); |
---|
| 949 | + else |
---|
| 950 | + ceph_update_cap_mis(&fsc->mdsc->metric); |
---|
| 951 | + return r; |
---|
914 | 952 | } |
---|
915 | 953 | |
---|
916 | 954 | /* |
---|
.. | .. |
---|
952 | 990 | if (ci->i_rd_ref) |
---|
953 | 991 | used |= CEPH_CAP_FILE_RD; |
---|
954 | 992 | if (ci->i_rdcache_ref || |
---|
955 | | - (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ |
---|
| 993 | + (S_ISREG(ci->vfs_inode.i_mode) && |
---|
956 | 994 | ci->vfs_inode.i_data.nrpages)) |
---|
957 | 995 | used |= CEPH_CAP_FILE_CACHE; |
---|
958 | 996 | if (ci->i_wr_ref) |
---|
959 | 997 | used |= CEPH_CAP_FILE_WR; |
---|
960 | 998 | if (ci->i_wb_ref || ci->i_wrbuffer_ref) |
---|
961 | 999 | used |= CEPH_CAP_FILE_BUFFER; |
---|
| 1000 | + if (ci->i_fx_ref) |
---|
| 1001 | + used |= CEPH_CAP_FILE_EXCL; |
---|
962 | 1002 | return used; |
---|
963 | 1003 | } |
---|
| 1004 | + |
---|
| 1005 | +#define FMODE_WAIT_BIAS 1000 |
---|
964 | 1006 | |
---|
965 | 1007 | /* |
---|
966 | 1008 | * wanted, by virtue of open file modes |
---|
967 | 1009 | */ |
---|
968 | 1010 | int __ceph_caps_file_wanted(struct ceph_inode_info *ci) |
---|
969 | 1011 | { |
---|
970 | | - int i, bits = 0; |
---|
971 | | - for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
---|
972 | | - if (ci->i_nr_by_mode[i]) |
---|
973 | | - bits |= 1 << i; |
---|
| 1012 | + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); |
---|
| 1013 | + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); |
---|
| 1014 | + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); |
---|
| 1015 | + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); |
---|
| 1016 | + struct ceph_mount_options *opt = |
---|
| 1017 | + ceph_inode_to_client(&ci->vfs_inode)->mount_options; |
---|
| 1018 | + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; |
---|
| 1019 | + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; |
---|
| 1020 | + |
---|
| 1021 | + if (S_ISDIR(ci->vfs_inode.i_mode)) { |
---|
| 1022 | + int want = 0; |
---|
| 1023 | + |
---|
| 1024 | + /* use used_cutoff here, to keep dir's wanted caps longer */ |
---|
| 1025 | + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || |
---|
| 1026 | + time_after(ci->i_last_rd, used_cutoff)) |
---|
| 1027 | + want |= CEPH_CAP_ANY_SHARED; |
---|
| 1028 | + |
---|
| 1029 | + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || |
---|
| 1030 | + time_after(ci->i_last_wr, used_cutoff)) { |
---|
| 1031 | + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
---|
| 1032 | + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) |
---|
| 1033 | + want |= CEPH_CAP_ANY_DIR_OPS; |
---|
| 1034 | + } |
---|
| 1035 | + |
---|
| 1036 | + if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) |
---|
| 1037 | + want |= CEPH_CAP_PIN; |
---|
| 1038 | + |
---|
| 1039 | + return want; |
---|
| 1040 | + } else { |
---|
| 1041 | + int bits = 0; |
---|
| 1042 | + |
---|
| 1043 | + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { |
---|
| 1044 | + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || |
---|
| 1045 | + time_after(ci->i_last_rd, used_cutoff)) |
---|
| 1046 | + bits |= 1 << RD_SHIFT; |
---|
| 1047 | + } else if (time_after(ci->i_last_rd, idle_cutoff)) { |
---|
| 1048 | + bits |= 1 << RD_SHIFT; |
---|
| 1049 | + } |
---|
| 1050 | + |
---|
| 1051 | + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { |
---|
| 1052 | + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || |
---|
| 1053 | + time_after(ci->i_last_wr, used_cutoff)) |
---|
| 1054 | + bits |= 1 << WR_SHIFT; |
---|
| 1055 | + } else if (time_after(ci->i_last_wr, idle_cutoff)) { |
---|
| 1056 | + bits |= 1 << WR_SHIFT; |
---|
| 1057 | + } |
---|
| 1058 | + |
---|
| 1059 | + /* check lazyio only when read/write is wanted */ |
---|
| 1060 | + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && |
---|
| 1061 | + ci->i_nr_by_mode[LAZY_SHIFT] > 0) |
---|
| 1062 | + bits |= 1 << LAZY_SHIFT; |
---|
| 1063 | + |
---|
| 1064 | + return bits ? ceph_caps_for_mode(bits >> 1) : 0; |
---|
974 | 1065 | } |
---|
975 | | - if (bits == 0) |
---|
976 | | - return 0; |
---|
977 | | - return ceph_caps_for_mode(bits >> 1); |
---|
| 1066 | +} |
---|
| 1067 | + |
---|
| 1068 | +/* |
---|
| 1069 | + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) |
---|
| 1070 | + */ |
---|
| 1071 | +int __ceph_caps_wanted(struct ceph_inode_info *ci) |
---|
| 1072 | +{ |
---|
| 1073 | + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); |
---|
| 1074 | + if (S_ISDIR(ci->vfs_inode.i_mode)) { |
---|
| 1075 | + /* we want EXCL if holding caps of dir ops */ |
---|
| 1076 | + if (w & CEPH_CAP_ANY_DIR_OPS) |
---|
| 1077 | + w |= CEPH_CAP_FILE_EXCL; |
---|
| 1078 | + } else { |
---|
| 1079 | + /* we want EXCL if dirty data */ |
---|
| 1080 | + if (w & CEPH_CAP_FILE_BUFFER) |
---|
| 1081 | + w |= CEPH_CAP_FILE_EXCL; |
---|
| 1082 | + } |
---|
| 1083 | + return w; |
---|
978 | 1084 | } |
---|
979 | 1085 | |
---|
980 | 1086 | /* |
---|
.. | .. |
---|
998 | 1104 | return mds_wanted; |
---|
999 | 1105 | } |
---|
1000 | 1106 | |
---|
1001 | | -/* |
---|
1002 | | - * called under i_ceph_lock |
---|
1003 | | - */ |
---|
1004 | | -static int __ceph_is_single_caps(struct ceph_inode_info *ci) |
---|
1005 | | -{ |
---|
1006 | | - return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); |
---|
1007 | | -} |
---|
1008 | | - |
---|
1009 | | -static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
---|
1010 | | -{ |
---|
1011 | | - return !RB_EMPTY_ROOT(&ci->i_caps); |
---|
1012 | | -} |
---|
1013 | | - |
---|
1014 | 1107 | int ceph_is_any_caps(struct inode *inode) |
---|
1015 | 1108 | { |
---|
1016 | 1109 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1017 | 1110 | int ret; |
---|
1018 | 1111 | |
---|
1019 | 1112 | spin_lock(&ci->i_ceph_lock); |
---|
1020 | | - ret = __ceph_is_any_caps(ci); |
---|
| 1113 | + ret = __ceph_is_any_real_caps(ci); |
---|
1021 | 1114 | spin_unlock(&ci->i_ceph_lock); |
---|
1022 | 1115 | |
---|
1023 | 1116 | return ret; |
---|
.. | .. |
---|
1062 | 1155 | |
---|
1063 | 1156 | /* remove from inode's cap rbtree, and clear auth cap */ |
---|
1064 | 1157 | rb_erase(&cap->ci_node, &ci->i_caps); |
---|
1065 | | - if (ci->i_auth_cap == cap) |
---|
| 1158 | + if (ci->i_auth_cap == cap) { |
---|
| 1159 | + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); |
---|
1066 | 1160 | ci->i_auth_cap = NULL; |
---|
| 1161 | + } |
---|
1067 | 1162 | |
---|
1068 | 1163 | /* remove from session list */ |
---|
1069 | 1164 | spin_lock(&session->s_cap_lock); |
---|
.. | .. |
---|
1074 | 1169 | } else { |
---|
1075 | 1170 | list_del_init(&cap->session_caps); |
---|
1076 | 1171 | session->s_nr_caps--; |
---|
| 1172 | + atomic64_dec(&mdsc->metric.total_caps); |
---|
1077 | 1173 | cap->session = NULL; |
---|
1078 | 1174 | removed = 1; |
---|
1079 | 1175 | } |
---|
.. | .. |
---|
1088 | 1184 | (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { |
---|
1089 | 1185 | cap->queue_release = 1; |
---|
1090 | 1186 | if (removed) { |
---|
1091 | | - list_add_tail(&cap->session_caps, |
---|
1092 | | - &session->s_cap_releases); |
---|
1093 | | - session->s_num_cap_releases++; |
---|
| 1187 | + __ceph_queue_cap_release(session, cap); |
---|
1094 | 1188 | removed = 0; |
---|
1095 | 1189 | } |
---|
1096 | 1190 | } else { |
---|
.. | .. |
---|
1103 | 1197 | if (removed) |
---|
1104 | 1198 | ceph_put_cap(mdsc, cap); |
---|
1105 | 1199 | |
---|
1106 | | - /* when reconnect denied, we remove session caps forcibly, |
---|
1107 | | - * i_wr_ref can be non-zero. If there are ongoing writes, |
---|
1108 | | - * keep i_snap_realm. |
---|
1109 | | - */ |
---|
1110 | | - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) |
---|
1111 | | - drop_inode_snap_realm(ci); |
---|
| 1200 | + if (!__ceph_is_any_real_caps(ci)) { |
---|
| 1201 | + /* when reconnect denied, we remove session caps forcibly, |
---|
| 1202 | + * i_wr_ref can be non-zero. If there are ongoing writes, |
---|
| 1203 | + * keep i_snap_realm. |
---|
| 1204 | + */ |
---|
| 1205 | + if (ci->i_wr_ref == 0 && ci->i_snap_realm) |
---|
| 1206 | + drop_inode_snap_realm(ci); |
---|
1112 | 1207 | |
---|
1113 | | - if (!__ceph_is_any_real_caps(ci)) |
---|
1114 | 1208 | __cap_delay_cancel(mdsc, ci); |
---|
| 1209 | + } |
---|
1115 | 1210 | } |
---|
1116 | 1211 | |
---|
1117 | 1212 | struct cap_msg_args { |
---|
.. | .. |
---|
1119 | 1214 | u64 ino, cid, follows; |
---|
1120 | 1215 | u64 flush_tid, oldest_flush_tid, size, max_size; |
---|
1121 | 1216 | u64 xattr_version; |
---|
| 1217 | + u64 change_attr; |
---|
1122 | 1218 | struct ceph_buffer *xattr_buf; |
---|
1123 | | - struct timespec64 atime, mtime, ctime; |
---|
| 1219 | + struct ceph_buffer *old_xattr_buf; |
---|
| 1220 | + struct timespec64 atime, mtime, ctime, btime; |
---|
1124 | 1221 | int op, caps, wanted, dirty; |
---|
1125 | 1222 | u32 seq, issue_seq, mseq, time_warp_seq; |
---|
1126 | 1223 | u32 flags; |
---|
.. | .. |
---|
1128 | 1225 | kgid_t gid; |
---|
1129 | 1226 | umode_t mode; |
---|
1130 | 1227 | bool inline_data; |
---|
| 1228 | + bool wake; |
---|
1131 | 1229 | }; |
---|
1132 | 1230 | |
---|
1133 | 1231 | /* |
---|
1134 | | - * Build and send a cap message to the given MDS. |
---|
1135 | | - * |
---|
1136 | | - * Caller should be holding s_mutex. |
---|
| 1232 | + * cap struct size + flock buffer size + inline version + inline data size + |
---|
| 1233 | + * osd_epoch_barrier + oldest_flush_tid |
---|
1137 | 1234 | */ |
---|
1138 | | -static int send_cap_msg(struct cap_msg_args *arg) |
---|
| 1235 | +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ |
---|
| 1236 | + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) |
---|
| 1237 | + |
---|
| 1238 | +/* Marshal up the cap msg to the MDS */ |
---|
| 1239 | +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) |
---|
1139 | 1240 | { |
---|
1140 | 1241 | struct ceph_mds_caps *fc; |
---|
1141 | | - struct ceph_msg *msg; |
---|
1142 | 1242 | void *p; |
---|
1143 | | - size_t extra_len; |
---|
1144 | | - struct timespec64 zerotime = {0}; |
---|
1145 | 1243 | struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; |
---|
1146 | 1244 | |
---|
1147 | | - dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" |
---|
1148 | | - " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" |
---|
1149 | | - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), |
---|
1150 | | - arg->cid, arg->ino, ceph_cap_string(arg->caps), |
---|
1151 | | - ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), |
---|
1152 | | - arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, |
---|
1153 | | - arg->mseq, arg->follows, arg->size, arg->max_size, |
---|
1154 | | - arg->xattr_version, |
---|
| 1245 | + dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", |
---|
| 1246 | + __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, |
---|
| 1247 | + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), |
---|
| 1248 | + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, |
---|
| 1249 | + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, |
---|
| 1250 | + arg->size, arg->max_size, arg->xattr_version, |
---|
1155 | 1251 | arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); |
---|
1156 | | - |
---|
1157 | | - /* flock buffer size + inline version + inline data size + |
---|
1158 | | - * osd_epoch_barrier + oldest_flush_tid */ |
---|
1159 | | - extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; |
---|
1160 | | - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, |
---|
1161 | | - GFP_NOFS, false); |
---|
1162 | | - if (!msg) |
---|
1163 | | - return -ENOMEM; |
---|
1164 | 1252 | |
---|
1165 | 1253 | msg->hdr.version = cpu_to_le16(10); |
---|
1166 | 1254 | msg->hdr.tid = cpu_to_le64(arg->flush_tid); |
---|
.. | .. |
---|
1226 | 1314 | /* pool namespace (version 8) (mds always ignores this) */ |
---|
1227 | 1315 | ceph_encode_32(&p, 0); |
---|
1228 | 1316 | |
---|
1229 | | - /* |
---|
1230 | | - * btime and change_attr (version 9) |
---|
1231 | | - * |
---|
1232 | | - * We just zero these out for now, as the MDS ignores them unless |
---|
1233 | | - * the requisite feature flags are set (which we don't do yet). |
---|
1234 | | - */ |
---|
1235 | | - ceph_encode_timespec64(p, &zerotime); |
---|
| 1317 | + /* btime and change_attr (version 9) */ |
---|
| 1318 | + ceph_encode_timespec64(p, &arg->btime); |
---|
1236 | 1319 | p += sizeof(struct ceph_timespec); |
---|
1237 | | - ceph_encode_64(&p, 0); |
---|
| 1320 | + ceph_encode_64(&p, arg->change_attr); |
---|
1238 | 1321 | |
---|
1239 | 1322 | /* Advisory flags (version 10) */ |
---|
1240 | 1323 | ceph_encode_32(&p, arg->flags); |
---|
1241 | | - |
---|
1242 | | - ceph_con_send(&arg->session->s_con, msg); |
---|
1243 | | - return 0; |
---|
1244 | 1324 | } |
---|
1245 | 1325 | |
---|
1246 | 1326 | /* |
---|
1247 | 1327 | * Queue cap releases when an inode is dropped from our cache. |
---|
1248 | 1328 | */ |
---|
1249 | | -void ceph_queue_caps_release(struct inode *inode) |
---|
| 1329 | +void __ceph_remove_caps(struct ceph_inode_info *ci) |
---|
1250 | 1330 | { |
---|
1251 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1252 | 1331 | struct rb_node *p; |
---|
1253 | 1332 | |
---|
1254 | 1333 | /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) |
---|
.. | .. |
---|
1264 | 1343 | } |
---|
1265 | 1344 | |
---|
1266 | 1345 | /* |
---|
1267 | | - * Send a cap msg on the given inode. Update our caps state, then |
---|
1268 | | - * drop i_ceph_lock and send the message. |
---|
| 1346 | + * Prepare to send a cap message to an MDS. Update the cap state, and populate |
---|
| 1347 | + * the arg struct with the parameters that will need to be sent. This should |
---|
| 1348 | + * be done under the i_ceph_lock to guard against changes to cap state. |
---|
1269 | 1349 | * |
---|
1270 | 1350 | * Make note of max_size reported/requested from mds, revoked caps |
---|
1271 | 1351 | * that have now been implemented. |
---|
1272 | | - * |
---|
1274 | | - * Make half-hearted attempt to invalidate page cache if we are |
---|
1274 | | - * dropping RDCACHE. Note that this will leave behind locked pages |
---|
1275 | | - * that we'll then need to deal with elsewhere. |
---|
1276 | | - * |
---|
1277 | | - * Return non-zero if delayed release, or we experienced an error |
---|
1278 | | - * such that the caller should requeue + retry later. |
---|
1279 | | - * |
---|
1280 | | - * called with i_ceph_lock, then drops it. |
---|
1281 | | - * caller should hold snap_rwsem (read), s_mutex. |
---|
1282 | 1352 | */ |
---|
1283 | | -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, |
---|
1284 | | - int op, bool sync, int used, int want, int retain, |
---|
1285 | | - int flushing, u64 flush_tid, u64 oldest_flush_tid) |
---|
1286 | | - __releases(cap->ci->i_ceph_lock) |
---|
| 1353 | +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, |
---|
| 1354 | + int op, int flags, int used, int want, int retain, |
---|
| 1355 | + int flushing, u64 flush_tid, u64 oldest_flush_tid) |
---|
1287 | 1356 | { |
---|
1288 | 1357 | struct ceph_inode_info *ci = cap->ci; |
---|
1289 | 1358 | struct inode *inode = &ci->vfs_inode; |
---|
1290 | | - struct ceph_buffer *old_blob = NULL; |
---|
1291 | | - struct cap_msg_args arg; |
---|
1292 | 1359 | int held, revoking; |
---|
1293 | | - int wake = 0; |
---|
1294 | | - int delayed = 0; |
---|
1295 | | - int ret; |
---|
| 1360 | + |
---|
| 1361 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
1296 | 1362 | |
---|
1297 | 1363 | held = cap->issued | cap->implemented; |
---|
1298 | 1364 | revoking = cap->implemented & ~cap->issued; |
---|
1299 | 1365 | retain &= ~revoking; |
---|
1300 | 1366 | |
---|
1301 | | - dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", |
---|
1302 | | - inode, cap, cap->session, |
---|
| 1367 | + dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", |
---|
| 1368 | + __func__, inode, cap, cap->session, |
---|
1303 | 1369 | ceph_cap_string(held), ceph_cap_string(held & retain), |
---|
1304 | 1370 | ceph_cap_string(revoking)); |
---|
1305 | 1371 | BUG_ON((retain & CEPH_CAP_PIN) == 0); |
---|
1306 | 1372 | |
---|
1307 | | - arg.session = cap->session; |
---|
1308 | | - |
---|
1309 | | - /* don't release wanted unless we've waited a bit. */ |
---|
1310 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && |
---|
1311 | | - time_before(jiffies, ci->i_hold_caps_min)) { |
---|
1312 | | - dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", |
---|
1313 | | - ceph_cap_string(cap->issued), |
---|
1314 | | - ceph_cap_string(cap->issued & retain), |
---|
1315 | | - ceph_cap_string(cap->mds_wanted), |
---|
1316 | | - ceph_cap_string(want)); |
---|
1317 | | - want |= cap->mds_wanted; |
---|
1318 | | - retain |= cap->issued; |
---|
1319 | | - delayed = 1; |
---|
1320 | | - } |
---|
1321 | | - ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); |
---|
1322 | | - if (want & ~cap->mds_wanted) { |
---|
1323 | | - /* user space may open/close single file frequently. |
---|
1324 | | - * This avoids droping mds_wanted immediately after |
---|
1325 | | - * requesting new mds_wanted. |
---|
1326 | | - */ |
---|
1327 | | - __cap_set_timeouts(mdsc, ci); |
---|
1328 | | - } |
---|
| 1373 | + ci->i_ceph_flags &= ~CEPH_I_FLUSH; |
---|
1329 | 1374 | |
---|
1330 | 1375 | cap->issued &= retain; /* drop bits we don't want */ |
---|
1331 | | - if (cap->implemented & ~cap->issued) { |
---|
1332 | | - /* |
---|
1333 | | - * Wake up any waiters on wanted -> needed transition. |
---|
1334 | | - * This is due to the weird transition from buffered |
---|
1335 | | - * to sync IO... we need to flush dirty pages _before_ |
---|
1336 | | - * allowing sync writes to avoid reordering. |
---|
1337 | | - */ |
---|
1338 | | - wake = 1; |
---|
1339 | | - } |
---|
| 1376 | + /* |
---|
| 1377 | + * Wake up any waiters on wanted -> needed transition. This is due to |
---|
| 1378 | + * the weird transition from buffered to sync IO... we need to flush |
---|
| 1379 | + * dirty pages _before_ allowing sync writes to avoid reordering. |
---|
| 1380 | + */ |
---|
| 1381 | + arg->wake = cap->implemented & ~cap->issued; |
---|
1340 | 1382 | cap->implemented &= cap->issued | used; |
---|
1341 | 1383 | cap->mds_wanted = want; |
---|
1342 | 1384 | |
---|
1343 | | - arg.ino = ceph_vino(inode).ino; |
---|
1344 | | - arg.cid = cap->cap_id; |
---|
1345 | | - arg.follows = flushing ? ci->i_head_snapc->seq : 0; |
---|
1346 | | - arg.flush_tid = flush_tid; |
---|
1347 | | - arg.oldest_flush_tid = oldest_flush_tid; |
---|
| 1385 | + arg->session = cap->session; |
---|
| 1386 | + arg->ino = ceph_vino(inode).ino; |
---|
| 1387 | + arg->cid = cap->cap_id; |
---|
| 1388 | + arg->follows = flushing ? ci->i_head_snapc->seq : 0; |
---|
| 1389 | + arg->flush_tid = flush_tid; |
---|
| 1390 | + arg->oldest_flush_tid = oldest_flush_tid; |
---|
1348 | 1391 | |
---|
1349 | | - arg.size = inode->i_size; |
---|
1350 | | - ci->i_reported_size = arg.size; |
---|
1351 | | - arg.max_size = ci->i_wanted_max_size; |
---|
1352 | | - ci->i_requested_max_size = arg.max_size; |
---|
| 1392 | + arg->size = inode->i_size; |
---|
| 1393 | + ci->i_reported_size = arg->size; |
---|
| 1394 | + arg->max_size = ci->i_wanted_max_size; |
---|
| 1395 | + if (cap == ci->i_auth_cap) { |
---|
| 1396 | + if (want & CEPH_CAP_ANY_FILE_WR) |
---|
| 1397 | + ci->i_requested_max_size = arg->max_size; |
---|
| 1398 | + else |
---|
| 1399 | + ci->i_requested_max_size = 0; |
---|
| 1400 | + } |
---|
1353 | 1401 | |
---|
1354 | 1402 | if (flushing & CEPH_CAP_XATTR_EXCL) { |
---|
1355 | | - old_blob = __ceph_build_xattrs_blob(ci); |
---|
1356 | | - arg.xattr_version = ci->i_xattrs.version; |
---|
1357 | | - arg.xattr_buf = ci->i_xattrs.blob; |
---|
| 1403 | + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); |
---|
| 1404 | + arg->xattr_version = ci->i_xattrs.version; |
---|
| 1405 | + arg->xattr_buf = ci->i_xattrs.blob; |
---|
1358 | 1406 | } else { |
---|
1359 | | - arg.xattr_buf = NULL; |
---|
| 1407 | + arg->xattr_buf = NULL; |
---|
| 1408 | + arg->old_xattr_buf = NULL; |
---|
1360 | 1409 | } |
---|
1361 | 1410 | |
---|
1362 | | - arg.mtime = inode->i_mtime; |
---|
1363 | | - arg.atime = inode->i_atime; |
---|
1364 | | - arg.ctime = inode->i_ctime; |
---|
| 1411 | + arg->mtime = inode->i_mtime; |
---|
| 1412 | + arg->atime = inode->i_atime; |
---|
| 1413 | + arg->ctime = inode->i_ctime; |
---|
| 1414 | + arg->btime = ci->i_btime; |
---|
| 1415 | + arg->change_attr = inode_peek_iversion_raw(inode); |
---|
1365 | 1416 | |
---|
1366 | | - arg.op = op; |
---|
1367 | | - arg.caps = cap->implemented; |
---|
1368 | | - arg.wanted = want; |
---|
1369 | | - arg.dirty = flushing; |
---|
| 1417 | + arg->op = op; |
---|
| 1418 | + arg->caps = cap->implemented; |
---|
| 1419 | + arg->wanted = want; |
---|
| 1420 | + arg->dirty = flushing; |
---|
1370 | 1421 | |
---|
1371 | | - arg.seq = cap->seq; |
---|
1372 | | - arg.issue_seq = cap->issue_seq; |
---|
1373 | | - arg.mseq = cap->mseq; |
---|
1374 | | - arg.time_warp_seq = ci->i_time_warp_seq; |
---|
| 1422 | + arg->seq = cap->seq; |
---|
| 1423 | + arg->issue_seq = cap->issue_seq; |
---|
| 1424 | + arg->mseq = cap->mseq; |
---|
| 1425 | + arg->time_warp_seq = ci->i_time_warp_seq; |
---|
1375 | 1426 | |
---|
1376 | | - arg.uid = inode->i_uid; |
---|
1377 | | - arg.gid = inode->i_gid; |
---|
1378 | | - arg.mode = inode->i_mode; |
---|
| 1427 | + arg->uid = inode->i_uid; |
---|
| 1428 | + arg->gid = inode->i_gid; |
---|
| 1429 | + arg->mode = inode->i_mode; |
---|
1379 | 1430 | |
---|
1380 | | - arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; |
---|
1381 | | - if (list_empty(&ci->i_cap_snaps)) |
---|
1382 | | - arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; |
---|
1383 | | - else |
---|
1384 | | - arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; |
---|
1385 | | - if (sync) |
---|
1386 | | - arg.flags |= CEPH_CLIENT_CAPS_SYNC; |
---|
| 1431 | + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; |
---|
| 1432 | + if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && |
---|
| 1433 | + !list_empty(&ci->i_cap_snaps)) { |
---|
| 1434 | + struct ceph_cap_snap *capsnap; |
---|
| 1435 | + list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { |
---|
| 1436 | + if (capsnap->cap_flush.tid) |
---|
| 1437 | + break; |
---|
| 1438 | + if (capsnap->need_flush) { |
---|
| 1439 | + flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; |
---|
| 1440 | + break; |
---|
| 1441 | + } |
---|
| 1442 | + } |
---|
| 1443 | + } |
---|
| 1444 | + arg->flags = flags; |
---|
| 1445 | +} |
---|
1387 | 1446 | |
---|
1388 | | - spin_unlock(&ci->i_ceph_lock); |
---|
| 1447 | +/* |
---|
| 1448 | + * Send a cap msg on the given inode. |
---|
| 1449 | + * |
---|
| 1450 | + * Caller should hold snap_rwsem (read), s_mutex. |
---|
| 1451 | + */ |
---|
| 1452 | +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) |
---|
| 1453 | +{ |
---|
| 1454 | + struct ceph_msg *msg; |
---|
| 1455 | + struct inode *inode = &ci->vfs_inode; |
---|
1389 | 1456 | |
---|
1390 | | - ceph_buffer_put(old_blob); |
---|
1391 | | - |
---|
1392 | | - ret = send_cap_msg(&arg); |
---|
1393 | | - if (ret < 0) { |
---|
1394 | | - dout("error sending cap msg, must requeue %p\n", inode); |
---|
1395 | | - delayed = 1; |
---|
| 1457 | + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); |
---|
| 1458 | + if (!msg) { |
---|
| 1459 | + pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", |
---|
| 1460 | + ceph_vinop(inode), ceph_cap_string(arg->dirty), |
---|
| 1461 | + arg->flush_tid); |
---|
| 1462 | + spin_lock(&ci->i_ceph_lock); |
---|
| 1463 | + __cap_delay_requeue(arg->session->s_mdsc, ci); |
---|
| 1464 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 1465 | + return; |
---|
1396 | 1466 | } |
---|
1397 | 1467 | |
---|
1398 | | - if (wake) |
---|
| 1468 | + encode_cap_msg(msg, arg); |
---|
| 1469 | + ceph_con_send(&arg->session->s_con, msg); |
---|
| 1470 | + ceph_buffer_put(arg->old_xattr_buf); |
---|
| 1471 | + if (arg->wake) |
---|
1399 | 1472 | wake_up_all(&ci->i_cap_wq); |
---|
1400 | | - |
---|
1401 | | - return delayed; |
---|
1402 | 1473 | } |
---|
1403 | 1474 | |
---|
1404 | 1475 | static inline int __send_flush_snap(struct inode *inode, |
---|
.. | .. |
---|
1407 | 1478 | u32 mseq, u64 oldest_flush_tid) |
---|
1408 | 1479 | { |
---|
1409 | 1480 | struct cap_msg_args arg; |
---|
| 1481 | + struct ceph_msg *msg; |
---|
| 1482 | + |
---|
| 1483 | + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); |
---|
| 1484 | + if (!msg) |
---|
| 1485 | + return -ENOMEM; |
---|
1410 | 1486 | |
---|
1411 | 1487 | arg.session = session; |
---|
1412 | 1488 | arg.ino = ceph_vino(inode).ino; |
---|
.. | .. |
---|
1419 | 1495 | arg.max_size = 0; |
---|
1420 | 1496 | arg.xattr_version = capsnap->xattr_version; |
---|
1421 | 1497 | arg.xattr_buf = capsnap->xattr_blob; |
---|
| 1498 | + arg.old_xattr_buf = NULL; |
---|
1422 | 1499 | |
---|
1423 | 1500 | arg.atime = capsnap->atime; |
---|
1424 | 1501 | arg.mtime = capsnap->mtime; |
---|
1425 | 1502 | arg.ctime = capsnap->ctime; |
---|
| 1503 | + arg.btime = capsnap->btime; |
---|
| 1504 | + arg.change_attr = capsnap->change_attr; |
---|
1426 | 1505 | |
---|
1427 | 1506 | arg.op = CEPH_CAP_OP_FLUSHSNAP; |
---|
1428 | 1507 | arg.caps = capsnap->issued; |
---|
.. | .. |
---|
1440 | 1519 | |
---|
1441 | 1520 | arg.inline_data = capsnap->inline_data; |
---|
1442 | 1521 | arg.flags = 0; |
---|
| 1522 | + arg.wake = false; |
---|
1443 | 1523 | |
---|
1444 | | - return send_cap_msg(&arg); |
---|
| 1524 | + encode_cap_msg(msg, &arg); |
---|
| 1525 | + ceph_con_send(&arg.session->s_con, msg); |
---|
| 1526 | + return 0; |
---|
1445 | 1527 | } |
---|
1446 | 1528 | |
---|
1447 | 1529 | /* |
---|
.. | .. |
---|
1554 | 1636 | struct inode *inode = &ci->vfs_inode; |
---|
1555 | 1637 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
---|
1556 | 1638 | struct ceph_mds_session *session = NULL; |
---|
| 1639 | + bool need_put = false; |
---|
1557 | 1640 | int mds; |
---|
1558 | 1641 | |
---|
1559 | 1642 | dout("ceph_flush_snaps %p\n", inode); |
---|
.. | .. |
---|
1590 | 1673 | } |
---|
1591 | 1674 | |
---|
1592 | 1675 | // make sure flushsnap messages are sent in proper order. |
---|
1593 | | - if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
---|
| 1676 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
---|
1594 | 1677 | __kick_flushing_caps(mdsc, session, ci, 0); |
---|
1595 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
1596 | | - } |
---|
1597 | 1678 | |
---|
1598 | 1679 | __ceph_flush_snaps(ci, session); |
---|
1599 | 1680 | out: |
---|
.. | .. |
---|
1607 | 1688 | } |
---|
1608 | 1689 | /* we flushed them all; remove this inode from the queue */ |
---|
1609 | 1690 | spin_lock(&mdsc->snap_flush_lock); |
---|
| 1691 | + if (!list_empty(&ci->i_snap_flush_item)) |
---|
| 1692 | + need_put = true; |
---|
1610 | 1693 | list_del_init(&ci->i_snap_flush_item); |
---|
1611 | 1694 | spin_unlock(&mdsc->snap_flush_lock); |
---|
| 1695 | + |
---|
| 1696 | + if (need_put) |
---|
| 1697 | + iput(inode); |
---|
1612 | 1698 | } |
---|
1613 | 1699 | |
---|
1614 | 1700 | /* |
---|
.. | .. |
---|
1625 | 1711 | int was = ci->i_dirty_caps; |
---|
1626 | 1712 | int dirty = 0; |
---|
1627 | 1713 | |
---|
| 1714 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 1715 | + |
---|
1628 | 1716 | if (!ci->i_auth_cap) { |
---|
1629 | 1717 | pr_warn("__mark_dirty_caps %p %llx mask %s, " |
---|
1630 | 1718 | "but no auth cap (session was closed?)\n", |
---|
.. | .. |
---|
1637 | 1725 | ceph_cap_string(was | mask)); |
---|
1638 | 1726 | ci->i_dirty_caps |= mask; |
---|
1639 | 1727 | if (was == 0) { |
---|
| 1728 | + struct ceph_mds_session *session = ci->i_auth_cap->session; |
---|
| 1729 | + |
---|
1640 | 1730 | WARN_ON_ONCE(ci->i_prealloc_cap_flush); |
---|
1641 | 1731 | swap(ci->i_prealloc_cap_flush, *pcf); |
---|
1642 | 1732 | |
---|
.. | .. |
---|
1649 | 1739 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); |
---|
1650 | 1740 | BUG_ON(!list_empty(&ci->i_dirty_item)); |
---|
1651 | 1741 | spin_lock(&mdsc->cap_dirty_lock); |
---|
1652 | | - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); |
---|
| 1742 | + list_add(&ci->i_dirty_item, &session->s_cap_dirty); |
---|
1653 | 1743 | spin_unlock(&mdsc->cap_dirty_lock); |
---|
1654 | 1744 | if (ci->i_flushing_caps == 0) { |
---|
1655 | 1745 | ihold(inode); |
---|
.. | .. |
---|
1668 | 1758 | |
---|
1669 | 1759 | struct ceph_cap_flush *ceph_alloc_cap_flush(void) |
---|
1670 | 1760 | { |
---|
1671 | | - return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); |
---|
| 1761 | + struct ceph_cap_flush *cf; |
---|
| 1762 | + |
---|
| 1763 | + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); |
---|
| 1764 | + if (!cf) |
---|
| 1765 | + return NULL; |
---|
| 1766 | + |
---|
| 1767 | + cf->is_capsnap = false; |
---|
| 1768 | + return cf; |
---|
1672 | 1769 | } |
---|
1673 | 1770 | |
---|
1674 | 1771 | void ceph_free_cap_flush(struct ceph_cap_flush *cf) |
---|
.. | .. |
---|
1692 | 1789 | * Remove cap_flush from the mdsc's or inode's flushing cap list. |
---|
1693 | 1790 | * Return true if caller needs to wake up flush waiters. |
---|
1694 | 1791 | */ |
---|
1695 | | -static bool __finish_cap_flush(struct ceph_mds_client *mdsc, |
---|
1696 | | - struct ceph_inode_info *ci, |
---|
1697 | | - struct ceph_cap_flush *cf) |
---|
| 1792 | +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, |
---|
| 1793 | + struct ceph_cap_flush *cf) |
---|
1698 | 1794 | { |
---|
1699 | 1795 | struct ceph_cap_flush *prev; |
---|
1700 | 1796 | bool wake = cf->wake; |
---|
1701 | | - if (mdsc) { |
---|
1702 | | - /* are there older pending cap flushes? */ |
---|
1703 | | - if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { |
---|
1704 | | - prev = list_prev_entry(cf, g_list); |
---|
1705 | | - prev->wake = true; |
---|
1706 | | - wake = false; |
---|
1707 | | - } |
---|
1708 | | - list_del(&cf->g_list); |
---|
1709 | | - } else if (ci) { |
---|
1710 | | - if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { |
---|
1711 | | - prev = list_prev_entry(cf, i_list); |
---|
1712 | | - prev->wake = true; |
---|
1713 | | - wake = false; |
---|
1714 | | - } |
---|
1715 | | - list_del(&cf->i_list); |
---|
1716 | | - } else { |
---|
1717 | | - BUG_ON(1); |
---|
| 1797 | + |
---|
| 1798 | + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { |
---|
| 1799 | + prev = list_prev_entry(cf, g_list); |
---|
| 1800 | + prev->wake = true; |
---|
| 1801 | + wake = false; |
---|
1718 | 1802 | } |
---|
| 1803 | + list_del_init(&cf->g_list); |
---|
| 1804 | + return wake; |
---|
| 1805 | +} |
---|
| 1806 | + |
---|
| 1807 | +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, |
---|
| 1808 | + struct ceph_cap_flush *cf) |
---|
| 1809 | +{ |
---|
| 1810 | + struct ceph_cap_flush *prev; |
---|
| 1811 | + bool wake = cf->wake; |
---|
| 1812 | + |
---|
| 1813 | + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { |
---|
| 1814 | + prev = list_prev_entry(cf, i_list); |
---|
| 1815 | + prev->wake = true; |
---|
| 1816 | + wake = false; |
---|
| 1817 | + } |
---|
| 1818 | + list_del_init(&cf->i_list); |
---|
1719 | 1819 | return wake; |
---|
1720 | 1820 | } |
---|
1721 | 1821 | |
---|
.. | .. |
---|
1723 | 1823 | * Add dirty inode to the flushing list. Assigned a seq number so we |
---|
1724 | 1824 | * can wait for caps to flush without starving. |
---|
1725 | 1825 | * |
---|
1726 | | - * Called under i_ceph_lock. |
---|
| 1826 | + * Called under i_ceph_lock. Returns the flush tid. |
---|
1727 | 1827 | */ |
---|
1728 | | -static int __mark_caps_flushing(struct inode *inode, |
---|
| 1828 | +static u64 __mark_caps_flushing(struct inode *inode, |
---|
1729 | 1829 | struct ceph_mds_session *session, bool wake, |
---|
1730 | | - u64 *flush_tid, u64 *oldest_flush_tid) |
---|
| 1830 | + u64 *oldest_flush_tid) |
---|
1731 | 1831 | { |
---|
1732 | 1832 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
---|
1733 | 1833 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1734 | 1834 | struct ceph_cap_flush *cf = NULL; |
---|
1735 | 1835 | int flushing; |
---|
1736 | 1836 | |
---|
| 1837 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
1737 | 1838 | BUG_ON(ci->i_dirty_caps == 0); |
---|
1738 | 1839 | BUG_ON(list_empty(&ci->i_dirty_item)); |
---|
1739 | 1840 | BUG_ON(!ci->i_prealloc_cap_flush); |
---|
.. | .. |
---|
1766 | 1867 | |
---|
1767 | 1868 | list_add_tail(&cf->i_list, &ci->i_cap_flush_list); |
---|
1768 | 1869 | |
---|
1769 | | - *flush_tid = cf->tid; |
---|
1770 | | - return flushing; |
---|
| 1870 | + return cf->tid; |
---|
1771 | 1871 | } |
---|
1772 | 1872 | |
---|
1773 | 1873 | /* |
---|
.. | .. |
---|
1817 | 1917 | * versus held caps. Release, flush, ack revoked caps to mds as |
---|
1818 | 1918 | * appropriate. |
---|
1819 | 1919 | * |
---|
1820 | | - * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay |
---|
1821 | | - * cap release further. |
---|
1822 | 1920 | * CHECK_CAPS_AUTHONLY - we should only check the auth cap |
---|
1823 | 1921 | * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without |
---|
1824 | 1922 | * further delay. |
---|
.. | .. |
---|
1826 | 1924 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, |
---|
1827 | 1925 | struct ceph_mds_session *session) |
---|
1828 | 1926 | { |
---|
1829 | | - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); |
---|
1830 | | - struct ceph_mds_client *mdsc = fsc->mdsc; |
---|
1831 | 1927 | struct inode *inode = &ci->vfs_inode; |
---|
| 1928 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); |
---|
1832 | 1929 | struct ceph_cap *cap; |
---|
1833 | 1930 | u64 flush_tid, oldest_flush_tid; |
---|
1834 | 1931 | int file_wanted, used, cap_used; |
---|
.. | .. |
---|
1837 | 1934 | int mds = -1; /* keep track of how far we've gone through i_caps list |
---|
1838 | 1935 | to avoid an infinite loop on retry */ |
---|
1839 | 1936 | struct rb_node *p; |
---|
1840 | | - int delayed = 0, sent = 0; |
---|
1841 | | - bool no_delay = flags & CHECK_CAPS_NODELAY; |
---|
1842 | 1937 | bool queue_invalidate = false; |
---|
1843 | 1938 | bool tried_invalidate = false; |
---|
1844 | 1939 | |
---|
1845 | | - /* if we are unmounting, flush any unused caps immediately. */ |
---|
1846 | | - if (mdsc->stopping) |
---|
1847 | | - no_delay = true; |
---|
1848 | | - |
---|
1849 | 1940 | spin_lock(&ci->i_ceph_lock); |
---|
1850 | | - |
---|
1851 | 1941 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
---|
1852 | 1942 | flags |= CHECK_CAPS_FLUSH; |
---|
1853 | | - |
---|
1854 | | - if (!(flags & CHECK_CAPS_AUTHONLY) || |
---|
1855 | | - (ci->i_auth_cap && __ceph_is_single_caps(ci))) |
---|
1856 | | - __cap_delay_cancel(mdsc, ci); |
---|
1857 | 1943 | |
---|
1858 | 1944 | goto retry_locked; |
---|
1859 | 1945 | retry: |
---|
1860 | 1946 | spin_lock(&ci->i_ceph_lock); |
---|
1861 | 1947 | retry_locked: |
---|
| 1948 | + /* Caps wanted by virtue of active open files. */ |
---|
1862 | 1949 | file_wanted = __ceph_caps_file_wanted(ci); |
---|
| 1950 | + |
---|
| 1951 | + /* Caps which have active references against them */ |
---|
1863 | 1952 | used = __ceph_caps_used(ci); |
---|
| 1953 | + |
---|
| 1954 | + /* |
---|
| 1955 | + * "issued" represents the current caps that the MDS wants us to have. |
---|
| 1956 | + * "implemented" is the set that we have been granted, and includes the |
---|
| 1957 | + * ones that have not yet been returned to the MDS (the "revoking" set, |
---|
| 1958 | + * usually because they have outstanding references). |
---|
| 1959 | + */ |
---|
1864 | 1960 | issued = __ceph_caps_issued(ci, &implemented); |
---|
1865 | 1961 | revoking = implemented & ~issued; |
---|
1866 | 1962 | |
---|
1867 | 1963 | want = file_wanted; |
---|
| 1964 | + |
---|
| 1965 | + /* The ones we currently want to retain (may be adjusted below) */ |
---|
1868 | 1966 | retain = file_wanted | used | CEPH_CAP_PIN; |
---|
1869 | 1967 | if (!mdsc->stopping && inode->i_nlink > 0) { |
---|
1870 | 1968 | if (file_wanted) { |
---|
1871 | 1969 | retain |= CEPH_CAP_ANY; /* be greedy */ |
---|
1872 | 1970 | } else if (S_ISDIR(inode->i_mode) && |
---|
1873 | 1971 | (issued & CEPH_CAP_FILE_SHARED) && |
---|
1874 | | - __ceph_dir_is_complete(ci)) { |
---|
| 1972 | + __ceph_dir_is_complete(ci)) { |
---|
1875 | 1973 | /* |
---|
1876 | 1974 | * If a directory is complete, we want to keep |
---|
1877 | 1975 | * the exclusive cap, so that the MDS does not end up |
---|
1878 | 1976 | * revoking the shared cap on every create/unlink |
---|
1879 | 1977 | * operation. |
---|
1880 | 1978 | */ |
---|
1881 | | - want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
---|
| 1979 | + if (IS_RDONLY(inode)) { |
---|
| 1980 | + want = CEPH_CAP_ANY_SHARED; |
---|
| 1981 | + } else { |
---|
| 1982 | + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
---|
| 1983 | + } |
---|
1882 | 1984 | retain |= want; |
---|
1883 | 1985 | } else { |
---|
1884 | 1986 | |
---|
.. | .. |
---|
1894 | 1996 | } |
---|
1895 | 1997 | |
---|
1896 | 1998 | dout("check_caps %p file_want %s used %s dirty %s flushing %s" |
---|
1897 | | - " issued %s revoking %s retain %s %s%s%s\n", inode, |
---|
| 1999 | + " issued %s revoking %s retain %s %s%s\n", inode, |
---|
1898 | 2000 | ceph_cap_string(file_wanted), |
---|
1899 | 2001 | ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), |
---|
1900 | 2002 | ceph_cap_string(ci->i_flushing_caps), |
---|
1901 | 2003 | ceph_cap_string(issued), ceph_cap_string(revoking), |
---|
1902 | 2004 | ceph_cap_string(retain), |
---|
1903 | 2005 | (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", |
---|
1904 | | - (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", |
---|
1905 | 2006 | (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); |
---|
1906 | 2007 | |
---|
1907 | 2008 | /* |
---|
.. | .. |
---|
1909 | 2010 | * have cached pages, but don't want them, then try to invalidate. |
---|
1910 | 2011 | * If we fail, it's because pages are locked.... try again later. |
---|
1911 | 2012 | */ |
---|
1912 | | - if ((!no_delay || mdsc->stopping) && |
---|
1913 | | - !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ |
---|
| 2013 | + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && |
---|
| 2014 | + S_ISREG(inode->i_mode) && |
---|
1914 | 2015 | !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ |
---|
1915 | 2016 | inode->i_data.nrpages && /* have cached pages */ |
---|
1916 | 2017 | (revoking & (CEPH_CAP_FILE_CACHE| |
---|
.. | .. |
---|
1927 | 2028 | } |
---|
1928 | 2029 | |
---|
1929 | 2030 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
---|
| 2031 | + int mflags = 0; |
---|
| 2032 | + struct cap_msg_args arg; |
---|
| 2033 | + |
---|
1930 | 2034 | cap = rb_entry(p, struct ceph_cap, ci_node); |
---|
1931 | 2035 | |
---|
1932 | 2036 | /* avoid looping forever */ |
---|
.. | .. |
---|
1936 | 2040 | |
---|
1937 | 2041 | /* NOTE: no side-effects allowed, until we take s_mutex */ |
---|
1938 | 2042 | |
---|
| 2043 | + /* |
---|
| 2044 | + * If we have an auth cap, we don't need to consider any |
---|
| 2045 | + * overlapping caps as used. |
---|
| 2046 | + */ |
---|
1939 | 2047 | cap_used = used; |
---|
1940 | 2048 | if (ci->i_auth_cap && cap != ci->i_auth_cap) |
---|
1941 | 2049 | cap_used &= ~ci->i_auth_cap->issued; |
---|
.. | .. |
---|
1990 | 2098 | } |
---|
1991 | 2099 | |
---|
1992 | 2100 | /* things we might delay */ |
---|
1993 | | - if ((cap->issued & ~retain) == 0 && |
---|
1994 | | - cap->mds_wanted == want) |
---|
| 2101 | + if ((cap->issued & ~retain) == 0) |
---|
1995 | 2102 | continue; /* nope, all good */ |
---|
1996 | 2103 | |
---|
1997 | | - if (no_delay) |
---|
1998 | | - goto ack; |
---|
1999 | | - |
---|
2000 | | - /* delay? */ |
---|
2001 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && |
---|
2002 | | - time_before(jiffies, ci->i_hold_caps_max)) { |
---|
2003 | | - dout(" delaying issued %s -> %s, wanted %s -> %s\n", |
---|
2004 | | - ceph_cap_string(cap->issued), |
---|
2005 | | - ceph_cap_string(cap->issued & retain), |
---|
2006 | | - ceph_cap_string(cap->mds_wanted), |
---|
2007 | | - ceph_cap_string(want)); |
---|
2008 | | - delayed++; |
---|
2009 | | - continue; |
---|
2010 | | - } |
---|
2011 | | - |
---|
2012 | 2104 | ack: |
---|
2013 | | - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
---|
2014 | | - dout(" skipping %p I_NOFLUSH set\n", inode); |
---|
2015 | | - continue; |
---|
2016 | | - } |
---|
2017 | | - |
---|
2018 | 2105 | if (session && session != cap->session) { |
---|
2019 | 2106 | dout("oops, wrong session %p mutex\n", session); |
---|
2020 | 2107 | mutex_unlock(&session->s_mutex); |
---|
.. | .. |
---|
2052 | 2139 | if (cap == ci->i_auth_cap && |
---|
2053 | 2140 | (ci->i_ceph_flags & |
---|
2054 | 2141 | (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { |
---|
2055 | | - if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
---|
| 2142 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
---|
2056 | 2143 | __kick_flushing_caps(mdsc, session, ci, 0); |
---|
2057 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
2058 | | - } |
---|
2059 | 2144 | if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) |
---|
2060 | 2145 | __ceph_flush_snaps(ci, session); |
---|
2061 | 2146 | |
---|
.. | .. |
---|
2076 | 2161 | } |
---|
2077 | 2162 | |
---|
2078 | 2163 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { |
---|
2079 | | - flushing = __mark_caps_flushing(inode, session, false, |
---|
2080 | | - &flush_tid, |
---|
2081 | | - &oldest_flush_tid); |
---|
| 2164 | + flushing = ci->i_dirty_caps; |
---|
| 2165 | + flush_tid = __mark_caps_flushing(inode, session, false, |
---|
| 2166 | + &oldest_flush_tid); |
---|
| 2167 | + if (flags & CHECK_CAPS_FLUSH && |
---|
| 2168 | + list_empty(&session->s_cap_dirty)) |
---|
| 2169 | + mflags |= CEPH_CLIENT_CAPS_SYNC; |
---|
2082 | 2170 | } else { |
---|
2083 | 2171 | flushing = 0; |
---|
2084 | 2172 | flush_tid = 0; |
---|
.. | .. |
---|
2088 | 2176 | } |
---|
2089 | 2177 | |
---|
2090 | 2178 | mds = cap->mds; /* remember mds, so we don't repeat */ |
---|
2091 | | - sent++; |
---|
2092 | 2179 | |
---|
2093 | | - /* __send_cap drops i_ceph_lock */ |
---|
2094 | | - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, |
---|
2095 | | - cap_used, want, retain, flushing, |
---|
2096 | | - flush_tid, oldest_flush_tid); |
---|
| 2180 | + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, |
---|
| 2181 | + want, retain, flushing, flush_tid, oldest_flush_tid); |
---|
| 2182 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 2183 | + |
---|
| 2184 | + __send_cap(&arg, ci); |
---|
| 2185 | + |
---|
2097 | 2186 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
---|
2098 | 2187 | } |
---|
2099 | 2188 | |
---|
2100 | | - /* Reschedule delayed caps release if we delayed anything */ |
---|
2101 | | - if (delayed) |
---|
| 2189 | + /* periodically re-calculate caps wanted by open files */ |
---|
| 2190 | + if (__ceph_is_any_real_caps(ci) && |
---|
| 2191 | + list_empty(&ci->i_cap_delay_list) && |
---|
| 2192 | + (file_wanted & ~CEPH_CAP_PIN) && |
---|
| 2193 | + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { |
---|
2102 | 2194 | __cap_delay_requeue(mdsc, ci); |
---|
| 2195 | + } |
---|
2103 | 2196 | |
---|
2104 | 2197 | spin_unlock(&ci->i_ceph_lock); |
---|
2105 | 2198 | |
---|
.. | .. |
---|
2125 | 2218 | |
---|
2126 | 2219 | retry: |
---|
2127 | 2220 | spin_lock(&ci->i_ceph_lock); |
---|
2128 | | - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
---|
2129 | | - spin_unlock(&ci->i_ceph_lock); |
---|
2130 | | - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); |
---|
2131 | | - goto out; |
---|
2132 | | - } |
---|
| 2221 | +retry_locked: |
---|
2133 | 2222 | if (ci->i_dirty_caps && ci->i_auth_cap) { |
---|
2134 | 2223 | struct ceph_cap *cap = ci->i_auth_cap; |
---|
2135 | | - int used = __ceph_caps_used(ci); |
---|
2136 | | - int want = __ceph_caps_wanted(ci); |
---|
2137 | | - int delayed; |
---|
| 2224 | + struct cap_msg_args arg; |
---|
2138 | 2225 | |
---|
2139 | | - if (!session || session != cap->session) { |
---|
| 2226 | + if (session != cap->session) { |
---|
2140 | 2227 | spin_unlock(&ci->i_ceph_lock); |
---|
2141 | 2228 | if (session) |
---|
2142 | 2229 | mutex_unlock(&session->s_mutex); |
---|
.. | .. |
---|
2149 | 2236 | goto out; |
---|
2150 | 2237 | } |
---|
2151 | 2238 | |
---|
2152 | | - flushing = __mark_caps_flushing(inode, session, true, |
---|
2153 | | - &flush_tid, &oldest_flush_tid); |
---|
2154 | | - |
---|
2155 | | - /* __send_cap drops i_ceph_lock */ |
---|
2156 | | - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, |
---|
2157 | | - used, want, (cap->issued | cap->implemented), |
---|
2158 | | - flushing, flush_tid, oldest_flush_tid); |
---|
2159 | | - |
---|
2160 | | - if (delayed) { |
---|
2161 | | - spin_lock(&ci->i_ceph_lock); |
---|
2162 | | - __cap_delay_requeue(mdsc, ci); |
---|
2163 | | - spin_unlock(&ci->i_ceph_lock); |
---|
| 2239 | + if (ci->i_ceph_flags & |
---|
| 2240 | + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { |
---|
| 2241 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
---|
| 2242 | + __kick_flushing_caps(mdsc, session, ci, 0); |
---|
| 2243 | + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) |
---|
| 2244 | + __ceph_flush_snaps(ci, session); |
---|
| 2245 | + goto retry_locked; |
---|
2164 | 2246 | } |
---|
| 2247 | + |
---|
| 2248 | + flushing = ci->i_dirty_caps; |
---|
| 2249 | + flush_tid = __mark_caps_flushing(inode, session, true, |
---|
| 2250 | + &oldest_flush_tid); |
---|
| 2251 | + |
---|
| 2252 | + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, |
---|
| 2253 | + __ceph_caps_used(ci), __ceph_caps_wanted(ci), |
---|
| 2254 | + (cap->issued | cap->implemented), |
---|
| 2255 | + flushing, flush_tid, oldest_flush_tid); |
---|
| 2256 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 2257 | + |
---|
| 2258 | + __send_cap(&arg, ci); |
---|
2165 | 2259 | } else { |
---|
2166 | 2260 | if (!list_empty(&ci->i_cap_flush_list)) { |
---|
2167 | 2261 | struct ceph_cap_flush *cf = |
---|
.. | .. |
---|
2206 | 2300 | */ |
---|
2207 | 2301 | static int unsafe_request_wait(struct inode *inode) |
---|
2208 | 2302 | { |
---|
| 2303 | + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
---|
2209 | 2304 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
2210 | 2305 | struct ceph_mds_request *req1 = NULL, *req2 = NULL; |
---|
2211 | 2306 | int ret, err = 0; |
---|
.. | .. |
---|
2225 | 2320 | } |
---|
2226 | 2321 | spin_unlock(&ci->i_unsafe_lock); |
---|
2227 | 2322 | |
---|
| 2323 | + /* |
---|
| 2324 | + * Manually trigger a flush of the journal logs in all the relevant |
---|
| 2325 | + * MDSes. Otherwise, in the worst case, we would have to wait up to |
---|
| 2326 | + * 5 seconds for the MDSes to flush their journal logs periodically. |
---|
| 2327 | + */ |
---|
| 2328 | + if (req1 || req2) { |
---|
| 2329 | + struct ceph_mds_request *req; |
---|
| 2330 | + struct ceph_mds_session **sessions; |
---|
| 2331 | + struct ceph_mds_session *s; |
---|
| 2332 | + unsigned int max_sessions; |
---|
| 2333 | + int i; |
---|
| 2334 | + |
---|
| 2335 | + mutex_lock(&mdsc->mutex); |
---|
| 2336 | + max_sessions = mdsc->max_sessions; |
---|
| 2337 | + |
---|
| 2338 | + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); |
---|
| 2339 | + if (!sessions) { |
---|
| 2340 | + mutex_unlock(&mdsc->mutex); |
---|
| 2341 | + err = -ENOMEM; |
---|
| 2342 | + goto out; |
---|
| 2343 | + } |
---|
| 2344 | + |
---|
| 2345 | + spin_lock(&ci->i_unsafe_lock); |
---|
| 2346 | + if (req1) { |
---|
| 2347 | + list_for_each_entry(req, &ci->i_unsafe_dirops, |
---|
| 2348 | + r_unsafe_dir_item) { |
---|
| 2349 | + s = req->r_session; |
---|
| 2350 | + if (!s) |
---|
| 2351 | + continue; |
---|
| 2352 | + if (!sessions[s->s_mds]) { |
---|
| 2353 | + s = ceph_get_mds_session(s); |
---|
| 2354 | + sessions[s->s_mds] = s; |
---|
| 2355 | + } |
---|
| 2356 | + } |
---|
| 2357 | + } |
---|
| 2358 | + if (req2) { |
---|
| 2359 | + list_for_each_entry(req, &ci->i_unsafe_iops, |
---|
| 2360 | + r_unsafe_target_item) { |
---|
| 2361 | + s = req->r_session; |
---|
| 2362 | + if (!s) |
---|
| 2363 | + continue; |
---|
| 2364 | + if (!sessions[s->s_mds]) { |
---|
| 2365 | + s = ceph_get_mds_session(s); |
---|
| 2366 | + sessions[s->s_mds] = s; |
---|
| 2367 | + } |
---|
| 2368 | + } |
---|
| 2369 | + } |
---|
| 2370 | + spin_unlock(&ci->i_unsafe_lock); |
---|
| 2371 | + |
---|
| 2372 | + /* the auth MDS */ |
---|
| 2373 | + spin_lock(&ci->i_ceph_lock); |
---|
| 2374 | + if (ci->i_auth_cap) { |
---|
| 2375 | + s = ci->i_auth_cap->session; |
---|
| 2376 | + if (!sessions[s->s_mds]) |
---|
| 2377 | + sessions[s->s_mds] = ceph_get_mds_session(s); |
---|
| 2378 | + } |
---|
| 2379 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 2380 | + mutex_unlock(&mdsc->mutex); |
---|
| 2381 | + |
---|
| 2382 | + /* send flush mdlog request to MDSes */ |
---|
| 2383 | + for (i = 0; i < max_sessions; i++) { |
---|
| 2384 | + s = sessions[i]; |
---|
| 2385 | + if (s) { |
---|
| 2386 | + send_flush_mdlog(s); |
---|
| 2387 | + ceph_put_mds_session(s); |
---|
| 2388 | + } |
---|
| 2389 | + } |
---|
| 2390 | + kfree(sessions); |
---|
| 2391 | + } |
---|
| 2392 | + |
---|
2228 | 2393 | dout("unsafe_request_wait %p wait on tid %llu %llu\n", |
---|
2229 | 2394 | inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); |
---|
2230 | 2395 | if (req1) { |
---|
.. | .. |
---|
2232 | 2397 | ceph_timeout_jiffies(req1->r_timeout)); |
---|
2233 | 2398 | if (ret) |
---|
2234 | 2399 | err = -EIO; |
---|
2235 | | - ceph_mdsc_put_request(req1); |
---|
2236 | 2400 | } |
---|
2237 | 2401 | if (req2) { |
---|
2238 | 2402 | ret = !wait_for_completion_timeout(&req2->r_safe_completion, |
---|
2239 | 2403 | ceph_timeout_jiffies(req2->r_timeout)); |
---|
2240 | 2404 | if (ret) |
---|
2241 | 2405 | err = -EIO; |
---|
2242 | | - ceph_mdsc_put_request(req2); |
---|
2243 | 2406 | } |
---|
| 2407 | + |
---|
| 2408 | +out: |
---|
| 2409 | + if (req1) |
---|
| 2410 | + ceph_mdsc_put_request(req1); |
---|
| 2411 | + if (req2) |
---|
| 2412 | + ceph_mdsc_put_request(req2); |
---|
2244 | 2413 | return err; |
---|
2245 | 2414 | } |
---|
2246 | 2415 | |
---|
.. | .. |
---|
2249 | 2418 | struct inode *inode = file->f_mapping->host; |
---|
2250 | 2419 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
2251 | 2420 | u64 flush_tid; |
---|
2252 | | - int ret; |
---|
| 2421 | + int ret, err; |
---|
2253 | 2422 | int dirty; |
---|
2254 | 2423 | |
---|
2255 | 2424 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); |
---|
2256 | 2425 | |
---|
2257 | 2426 | ret = file_write_and_wait_range(file, start, end); |
---|
2258 | | - if (ret < 0) |
---|
2259 | | - goto out; |
---|
2260 | | - |
---|
2261 | 2427 | if (datasync) |
---|
2262 | 2428 | goto out; |
---|
2263 | 2429 | |
---|
2264 | | - inode_lock(inode); |
---|
| 2430 | + ret = ceph_wait_on_async_create(inode); |
---|
| 2431 | + if (ret) |
---|
| 2432 | + goto out; |
---|
2265 | 2433 | |
---|
2266 | 2434 | dirty = try_flush_caps(inode, &flush_tid); |
---|
2267 | 2435 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); |
---|
2268 | 2436 | |
---|
2269 | | - ret = unsafe_request_wait(inode); |
---|
| 2437 | + err = unsafe_request_wait(inode); |
---|
2270 | 2438 | |
---|
2271 | 2439 | /* |
---|
2272 | 2440 | * only wait on non-file metadata writeback (the mds |
---|
2273 | 2441 | * can recover size and mtime, so we don't need to |
---|
2274 | 2442 | * wait for that) |
---|
2275 | 2443 | */ |
---|
2276 | | - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
---|
2277 | | - ret = wait_event_interruptible(ci->i_cap_wq, |
---|
| 2444 | + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
---|
| 2445 | + err = wait_event_interruptible(ci->i_cap_wq, |
---|
2278 | 2446 | caps_are_flushed(inode, flush_tid)); |
---|
2279 | 2447 | } |
---|
2280 | | - inode_unlock(inode); |
---|
| 2448 | + |
---|
| 2449 | + if (err < 0) |
---|
| 2450 | + ret = err; |
---|
| 2451 | + |
---|
| 2452 | + err = file_check_and_advance_wb_err(file); |
---|
| 2453 | + if (err < 0) |
---|
| 2454 | + ret = err; |
---|
2281 | 2455 | out: |
---|
2282 | 2456 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); |
---|
2283 | 2457 | return ret; |
---|
.. | .. |
---|
2327 | 2501 | struct ceph_cap_flush *cf; |
---|
2328 | 2502 | int ret; |
---|
2329 | 2503 | u64 first_tid = 0; |
---|
| 2504 | + u64 last_snap_flush = 0; |
---|
| 2505 | + |
---|
| 2506 | + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
| 2507 | + |
---|
| 2508 | + list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { |
---|
| 2509 | + if (cf->is_capsnap) { |
---|
| 2510 | + last_snap_flush = cf->tid; |
---|
| 2511 | + break; |
---|
| 2512 | + } |
---|
| 2513 | + } |
---|
2330 | 2514 | |
---|
2331 | 2515 | list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { |
---|
2332 | 2516 | if (cf->tid < first_tid) |
---|
.. | .. |
---|
2341 | 2525 | |
---|
2342 | 2526 | first_tid = cf->tid + 1; |
---|
2343 | 2527 | |
---|
2344 | | - if (cf->caps) { |
---|
| 2528 | + if (!cf->is_capsnap) { |
---|
| 2529 | + struct cap_msg_args arg; |
---|
| 2530 | + |
---|
2345 | 2531 | dout("kick_flushing_caps %p cap %p tid %llu %s\n", |
---|
2346 | 2532 | inode, cap, cf->tid, ceph_cap_string(cf->caps)); |
---|
2347 | | - ci->i_ceph_flags |= CEPH_I_NODELAY; |
---|
2348 | | - ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, |
---|
2349 | | - false, __ceph_caps_used(ci), |
---|
| 2533 | + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, |
---|
| 2534 | + (cf->tid < last_snap_flush ? |
---|
| 2535 | + CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), |
---|
| 2536 | + __ceph_caps_used(ci), |
---|
2350 | 2537 | __ceph_caps_wanted(ci), |
---|
2351 | | - cap->issued | cap->implemented, |
---|
| 2538 | + (cap->issued | cap->implemented), |
---|
2352 | 2539 | cf->caps, cf->tid, oldest_flush_tid); |
---|
2353 | | - if (ret) { |
---|
2354 | | - pr_err("kick_flushing_caps: error sending " |
---|
2355 | | - "cap flush, ino (%llx.%llx) " |
---|
2356 | | - "tid %llu flushing %s\n", |
---|
2357 | | - ceph_vinop(inode), cf->tid, |
---|
2358 | | - ceph_cap_string(cf->caps)); |
---|
2359 | | - } |
---|
| 2540 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 2541 | + __send_cap(&arg, ci); |
---|
2360 | 2542 | } else { |
---|
2361 | 2543 | struct ceph_cap_snap *capsnap = |
---|
2362 | 2544 | container_of(cf, struct ceph_cap_snap, |
---|
.. | .. |
---|
2417 | 2599 | */ |
---|
2418 | 2600 | if ((cap->issued & ci->i_flushing_caps) != |
---|
2419 | 2601 | ci->i_flushing_caps) { |
---|
2420 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
| 2602 | + /* encode_caps_cb() also will reset these sequence |
---|
| 2603 | + * numbers. make sure sequence numbers in cap flush |
---|
| 2604 | + * message match later reconnect message */ |
---|
| 2605 | + cap->seq = 0; |
---|
| 2606 | + cap->issue_seq = 0; |
---|
| 2607 | + cap->mseq = 0; |
---|
2421 | 2608 | __kick_flushing_caps(mdsc, session, ci, |
---|
2422 | 2609 | oldest_flush_tid); |
---|
2423 | 2610 | } else { |
---|
.. | .. |
---|
2435 | 2622 | struct ceph_cap *cap; |
---|
2436 | 2623 | u64 oldest_flush_tid; |
---|
2437 | 2624 | |
---|
| 2625 | + lockdep_assert_held(&session->s_mutex); |
---|
| 2626 | + |
---|
2438 | 2627 | dout("kick_flushing_caps mds%d\n", session->s_mds); |
---|
2439 | 2628 | |
---|
2440 | 2629 | spin_lock(&mdsc->cap_dirty_lock); |
---|
.. | .. |
---|
2451 | 2640 | continue; |
---|
2452 | 2641 | } |
---|
2453 | 2642 | if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
---|
2454 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
2455 | 2643 | __kick_flushing_caps(mdsc, session, ci, |
---|
2456 | 2644 | oldest_flush_tid); |
---|
2457 | 2645 | } |
---|
.. | .. |
---|
2459 | 2647 | } |
---|
2460 | 2648 | } |
---|
2461 | 2649 | |
---|
2462 | | -static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, |
---|
2463 | | - struct ceph_mds_session *session, |
---|
2464 | | - struct inode *inode) |
---|
2465 | | - __releases(ci->i_ceph_lock) |
---|
| 2650 | +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, |
---|
| 2651 | + struct ceph_inode_info *ci) |
---|
2466 | 2652 | { |
---|
2467 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
---|
2468 | | - struct ceph_cap *cap; |
---|
| 2653 | + struct ceph_mds_client *mdsc = session->s_mdsc; |
---|
| 2654 | + struct ceph_cap *cap = ci->i_auth_cap; |
---|
2469 | 2655 | |
---|
2470 | | - cap = ci->i_auth_cap; |
---|
2471 | | - dout("kick_flushing_inode_caps %p flushing %s\n", inode, |
---|
| 2656 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 2657 | + |
---|
| 2658 | + dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, |
---|
2472 | 2659 | ceph_cap_string(ci->i_flushing_caps)); |
---|
2473 | 2660 | |
---|
2474 | 2661 | if (!list_empty(&ci->i_cap_flush_list)) { |
---|
.. | .. |
---|
2479 | 2666 | oldest_flush_tid = __get_oldest_flush_tid(mdsc); |
---|
2480 | 2667 | spin_unlock(&mdsc->cap_dirty_lock); |
---|
2481 | 2668 | |
---|
2482 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
---|
2483 | 2669 | __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); |
---|
2484 | | - spin_unlock(&ci->i_ceph_lock); |
---|
2485 | | - } else { |
---|
2486 | | - spin_unlock(&ci->i_ceph_lock); |
---|
2487 | 2670 | } |
---|
2488 | 2671 | } |
---|
2489 | 2672 | |
---|
.. | .. |
---|
2491 | 2674 | /* |
---|
2492 | 2675 | * Take references to capabilities we hold, so that we don't release |
---|
2493 | 2676 | * them to the MDS prematurely. |
---|
2494 | | - * |
---|
2495 | | - * Protected by i_ceph_lock. |
---|
2496 | 2677 | */ |
---|
2497 | | -static void __take_cap_refs(struct ceph_inode_info *ci, int got, |
---|
| 2678 | +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, |
---|
2498 | 2679 | bool snap_rwsem_locked) |
---|
2499 | 2680 | { |
---|
| 2681 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 2682 | + |
---|
2500 | 2683 | if (got & CEPH_CAP_PIN) |
---|
2501 | 2684 | ci->i_pin_ref++; |
---|
2502 | 2685 | if (got & CEPH_CAP_FILE_RD) |
---|
2503 | 2686 | ci->i_rd_ref++; |
---|
2504 | 2687 | if (got & CEPH_CAP_FILE_CACHE) |
---|
2505 | 2688 | ci->i_rdcache_ref++; |
---|
| 2689 | + if (got & CEPH_CAP_FILE_EXCL) |
---|
| 2690 | + ci->i_fx_ref++; |
---|
2506 | 2691 | if (got & CEPH_CAP_FILE_WR) { |
---|
2507 | 2692 | if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { |
---|
2508 | 2693 | BUG_ON(!snap_rwsem_locked); |
---|
.. | .. |
---|
2515 | 2700 | if (ci->i_wb_ref == 0) |
---|
2516 | 2701 | ihold(&ci->vfs_inode); |
---|
2517 | 2702 | ci->i_wb_ref++; |
---|
2518 | | - dout("__take_cap_refs %p wb %d -> %d (?)\n", |
---|
| 2703 | + dout("%s %p wb %d -> %d (?)\n", __func__, |
---|
2519 | 2704 | &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); |
---|
2520 | 2705 | } |
---|
2521 | 2706 | } |
---|
.. | .. |
---|
2526 | 2711 | * to (when applicable), and check against max_size here as well. |
---|
2527 | 2712 | * Note that caller is responsible for ensuring max_size increases are |
---|
2528 | 2713 | * requested from the MDS. |
---|
| 2714 | + * |
---|
| 2715 | + * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, |
---|
| 2716 | + * or a negative error code. There are 3 speical error codes: |
---|
| 2717 | + * -EAGAIN: need to sleep but non-blocking is specified |
---|
| 2718 | + * -EFBIG: ask caller to call check_max_size() and try again. |
---|
| 2719 | + * -ESTALE: ask caller to call ceph_renew_caps() and try again. |
---|
2529 | 2720 | */ |
---|
2530 | | -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, |
---|
2531 | | - loff_t endoff, bool nonblock, int *got, int *err) |
---|
| 2721 | +enum { |
---|
| 2722 | + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ |
---|
| 2723 | + NON_BLOCKING = (1 << 8), |
---|
| 2724 | + CHECK_FILELOCK = (1 << 9), |
---|
| 2725 | +}; |
---|
| 2726 | + |
---|
| 2727 | +static int try_get_cap_refs(struct inode *inode, int need, int want, |
---|
| 2728 | + loff_t endoff, int flags, int *got) |
---|
2532 | 2729 | { |
---|
2533 | | - struct inode *inode = &ci->vfs_inode; |
---|
| 2730 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
2534 | 2731 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
---|
2535 | 2732 | int ret = 0; |
---|
2536 | 2733 | int have, implemented; |
---|
2537 | | - int file_wanted; |
---|
2538 | 2734 | bool snap_rwsem_locked = false; |
---|
2539 | 2735 | |
---|
2540 | 2736 | dout("get_cap_refs %p need %s want %s\n", inode, |
---|
.. | .. |
---|
2543 | 2739 | again: |
---|
2544 | 2740 | spin_lock(&ci->i_ceph_lock); |
---|
2545 | 2741 | |
---|
2546 | | - /* make sure file is actually open */ |
---|
2547 | | - file_wanted = __ceph_caps_file_wanted(ci); |
---|
2548 | | - if ((file_wanted & need) != need) { |
---|
2549 | | - dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", |
---|
2550 | | - ceph_cap_string(need), ceph_cap_string(file_wanted)); |
---|
2551 | | - *err = -EBADF; |
---|
2552 | | - ret = 1; |
---|
| 2742 | + if ((flags & CHECK_FILELOCK) && |
---|
| 2743 | + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { |
---|
| 2744 | + dout("try_get_cap_refs %p error filelock\n", inode); |
---|
| 2745 | + ret = -EIO; |
---|
2553 | 2746 | goto out_unlock; |
---|
2554 | 2747 | } |
---|
2555 | 2748 | |
---|
.. | .. |
---|
2570 | 2763 | if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { |
---|
2571 | 2764 | dout("get_cap_refs %p endoff %llu > maxsize %llu\n", |
---|
2572 | 2765 | inode, endoff, ci->i_max_size); |
---|
2573 | | - if (endoff > ci->i_requested_max_size) { |
---|
2574 | | - *err = -EAGAIN; |
---|
2575 | | - ret = 1; |
---|
2576 | | - } |
---|
| 2766 | + if (endoff > ci->i_requested_max_size) |
---|
| 2767 | + ret = ci->i_auth_cap ? -EFBIG : -ESTALE; |
---|
2577 | 2768 | goto out_unlock; |
---|
2578 | 2769 | } |
---|
2579 | 2770 | /* |
---|
.. | .. |
---|
2607 | 2798 | * we can not call down_read() when |
---|
2608 | 2799 | * task isn't in TASK_RUNNING state |
---|
2609 | 2800 | */ |
---|
2610 | | - if (nonblock) { |
---|
2611 | | - *err = -EAGAIN; |
---|
2612 | | - ret = 1; |
---|
| 2801 | + if (flags & NON_BLOCKING) { |
---|
| 2802 | + ret = -EAGAIN; |
---|
2613 | 2803 | goto out_unlock; |
---|
2614 | 2804 | } |
---|
2615 | 2805 | |
---|
.. | .. |
---|
2620 | 2810 | } |
---|
2621 | 2811 | snap_rwsem_locked = true; |
---|
2622 | 2812 | } |
---|
2623 | | - *got = need | (have & want); |
---|
2624 | | - if ((need & CEPH_CAP_FILE_RD) && |
---|
| 2813 | + if ((have & want) == want) |
---|
| 2814 | + *got = need | want; |
---|
| 2815 | + else |
---|
| 2816 | + *got = need; |
---|
| 2817 | + if (S_ISREG(inode->i_mode) && |
---|
| 2818 | + (need & CEPH_CAP_FILE_RD) && |
---|
2625 | 2819 | !(*got & CEPH_CAP_FILE_CACHE)) |
---|
2626 | 2820 | ceph_disable_fscache_readpage(ci); |
---|
2627 | | - __take_cap_refs(ci, *got, true); |
---|
| 2821 | + ceph_take_cap_refs(ci, *got, true); |
---|
2628 | 2822 | ret = 1; |
---|
2629 | 2823 | } |
---|
2630 | 2824 | } else { |
---|
2631 | 2825 | int session_readonly = false; |
---|
2632 | | - if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { |
---|
| 2826 | + int mds_wanted; |
---|
| 2827 | + if (ci->i_auth_cap && |
---|
| 2828 | + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { |
---|
2633 | 2829 | struct ceph_mds_session *s = ci->i_auth_cap->session; |
---|
2634 | 2830 | spin_lock(&s->s_cap_lock); |
---|
2635 | 2831 | session_readonly = s->s_readonly; |
---|
2636 | 2832 | spin_unlock(&s->s_cap_lock); |
---|
2637 | 2833 | } |
---|
2638 | 2834 | if (session_readonly) { |
---|
2639 | | - dout("get_cap_refs %p needed %s but mds%d readonly\n", |
---|
| 2835 | + dout("get_cap_refs %p need %s but mds%d readonly\n", |
---|
2640 | 2836 | inode, ceph_cap_string(need), ci->i_auth_cap->mds); |
---|
2641 | | - *err = -EROFS; |
---|
2642 | | - ret = 1; |
---|
| 2837 | + ret = -EROFS; |
---|
2643 | 2838 | goto out_unlock; |
---|
2644 | 2839 | } |
---|
2645 | 2840 | |
---|
2646 | | - if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { |
---|
2647 | | - int mds_wanted; |
---|
2648 | | - if (READ_ONCE(mdsc->fsc->mount_state) == |
---|
2649 | | - CEPH_MOUNT_SHUTDOWN) { |
---|
2650 | | - dout("get_cap_refs %p forced umount\n", inode); |
---|
2651 | | - *err = -EIO; |
---|
2652 | | - ret = 1; |
---|
2653 | | - goto out_unlock; |
---|
2654 | | - } |
---|
2655 | | - mds_wanted = __ceph_caps_mds_wanted(ci, false); |
---|
2656 | | - if (need & ~(mds_wanted & need)) { |
---|
2657 | | - dout("get_cap_refs %p caps were dropped" |
---|
2658 | | - " (session killed?)\n", inode); |
---|
2659 | | - *err = -ESTALE; |
---|
2660 | | - ret = 1; |
---|
2661 | | - goto out_unlock; |
---|
2662 | | - } |
---|
2663 | | - if (!(file_wanted & ~mds_wanted)) |
---|
2664 | | - ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; |
---|
| 2841 | + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
---|
| 2842 | + dout("get_cap_refs %p forced umount\n", inode); |
---|
| 2843 | + ret = -EIO; |
---|
| 2844 | + goto out_unlock; |
---|
| 2845 | + } |
---|
| 2846 | + mds_wanted = __ceph_caps_mds_wanted(ci, false); |
---|
| 2847 | + if (need & ~mds_wanted) { |
---|
| 2848 | + dout("get_cap_refs %p need %s > mds_wanted %s\n", |
---|
| 2849 | + inode, ceph_cap_string(need), |
---|
| 2850 | + ceph_cap_string(mds_wanted)); |
---|
| 2851 | + ret = -ESTALE; |
---|
| 2852 | + goto out_unlock; |
---|
2665 | 2853 | } |
---|
2666 | 2854 | |
---|
2667 | | - dout("get_cap_refs %p have %s needed %s\n", inode, |
---|
| 2855 | + dout("get_cap_refs %p have %s need %s\n", inode, |
---|
2668 | 2856 | ceph_cap_string(have), ceph_cap_string(need)); |
---|
2669 | 2857 | } |
---|
2670 | 2858 | out_unlock: |
---|
| 2859 | + |
---|
| 2860 | + __ceph_touch_fmode(ci, mdsc, flags); |
---|
| 2861 | + |
---|
2671 | 2862 | spin_unlock(&ci->i_ceph_lock); |
---|
2672 | 2863 | if (snap_rwsem_locked) |
---|
2673 | 2864 | up_read(&mdsc->snap_rwsem); |
---|
| 2865 | + |
---|
| 2866 | + if (!ret) |
---|
| 2867 | + ceph_update_cap_mis(&mdsc->metric); |
---|
| 2868 | + else if (ret == 1) |
---|
| 2869 | + ceph_update_cap_hit(&mdsc->metric); |
---|
2674 | 2870 | |
---|
2675 | 2871 | dout("get_cap_refs %p ret %d got %s\n", inode, |
---|
2676 | 2872 | ret, ceph_cap_string(*got)); |
---|
.. | .. |
---|
2705 | 2901 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
---|
2706 | 2902 | } |
---|
2707 | 2903 | |
---|
2708 | | -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) |
---|
| 2904 | +static inline int get_used_fmode(int caps) |
---|
2709 | 2905 | { |
---|
2710 | | - int ret, err = 0; |
---|
| 2906 | + int fmode = 0; |
---|
| 2907 | + if (caps & CEPH_CAP_FILE_RD) |
---|
| 2908 | + fmode |= CEPH_FILE_MODE_RD; |
---|
| 2909 | + if (caps & CEPH_CAP_FILE_WR) |
---|
| 2910 | + fmode |= CEPH_FILE_MODE_WR; |
---|
| 2911 | + return fmode; |
---|
| 2912 | +} |
---|
| 2913 | + |
---|
| 2914 | +int ceph_try_get_caps(struct inode *inode, int need, int want, |
---|
| 2915 | + bool nonblock, int *got) |
---|
| 2916 | +{ |
---|
| 2917 | + int ret, flags; |
---|
2711 | 2918 | |
---|
2712 | 2919 | BUG_ON(need & ~CEPH_CAP_FILE_RD); |
---|
2713 | | - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); |
---|
2714 | | - ret = ceph_pool_perm_check(ci, need); |
---|
2715 | | - if (ret < 0) |
---|
2716 | | - return ret; |
---|
2717 | | - |
---|
2718 | | - ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); |
---|
2719 | | - if (ret) { |
---|
2720 | | - if (err == -EAGAIN) { |
---|
2721 | | - ret = 0; |
---|
2722 | | - } else if (err < 0) { |
---|
2723 | | - ret = err; |
---|
2724 | | - } |
---|
| 2920 | + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | |
---|
| 2921 | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | |
---|
| 2922 | + CEPH_CAP_ANY_DIR_OPS)); |
---|
| 2923 | + if (need) { |
---|
| 2924 | + ret = ceph_pool_perm_check(inode, need); |
---|
| 2925 | + if (ret < 0) |
---|
| 2926 | + return ret; |
---|
2725 | 2927 | } |
---|
| 2928 | + |
---|
| 2929 | + flags = get_used_fmode(need | want); |
---|
| 2930 | + if (nonblock) |
---|
| 2931 | + flags |= NON_BLOCKING; |
---|
| 2932 | + |
---|
| 2933 | + ret = try_get_cap_refs(inode, need, want, 0, flags, got); |
---|
| 2934 | + /* three special error codes */ |
---|
| 2935 | + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) |
---|
| 2936 | + ret = 0; |
---|
2726 | 2937 | return ret; |
---|
2727 | 2938 | } |
---|
2728 | 2939 | |
---|
.. | .. |
---|
2731 | 2942 | * due to a small max_size, make sure we check_max_size (and possibly |
---|
2732 | 2943 | * ask the mds) so we don't get hung up indefinitely. |
---|
2733 | 2944 | */ |
---|
2734 | | -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, |
---|
| 2945 | +int ceph_get_caps(struct file *filp, int need, int want, |
---|
2735 | 2946 | loff_t endoff, int *got, struct page **pinned_page) |
---|
2736 | 2947 | { |
---|
2737 | | - int _got, ret, err = 0; |
---|
| 2948 | + struct ceph_file_info *fi = filp->private_data; |
---|
| 2949 | + struct inode *inode = file_inode(filp); |
---|
| 2950 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 2951 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
| 2952 | + int ret, _got, flags; |
---|
2738 | 2953 | |
---|
2739 | | - ret = ceph_pool_perm_check(ci, need); |
---|
| 2954 | + ret = ceph_pool_perm_check(inode, need); |
---|
2740 | 2955 | if (ret < 0) |
---|
2741 | 2956 | return ret; |
---|
2742 | 2957 | |
---|
2743 | | - while (true) { |
---|
2744 | | - if (endoff > 0) |
---|
2745 | | - check_max_size(&ci->vfs_inode, endoff); |
---|
| 2958 | + if ((fi->fmode & CEPH_FILE_MODE_WR) && |
---|
| 2959 | + fi->filp_gen != READ_ONCE(fsc->filp_gen)) |
---|
| 2960 | + return -EBADF; |
---|
2746 | 2961 | |
---|
2747 | | - err = 0; |
---|
| 2962 | + flags = get_used_fmode(need | want); |
---|
| 2963 | + |
---|
| 2964 | + while (true) { |
---|
| 2965 | + flags &= CEPH_FILE_MODE_MASK; |
---|
| 2966 | + if (vfs_inode_has_locks(inode)) |
---|
| 2967 | + flags |= CHECK_FILELOCK; |
---|
2748 | 2968 | _got = 0; |
---|
2749 | | - ret = try_get_cap_refs(ci, need, want, endoff, |
---|
2750 | | - false, &_got, &err); |
---|
2751 | | - if (ret) { |
---|
2752 | | - if (err == -EAGAIN) |
---|
2753 | | - continue; |
---|
2754 | | - if (err < 0) |
---|
2755 | | - ret = err; |
---|
2756 | | - } else { |
---|
| 2969 | + ret = try_get_cap_refs(inode, need, want, endoff, |
---|
| 2970 | + flags, &_got); |
---|
| 2971 | + WARN_ON_ONCE(ret == -EAGAIN); |
---|
| 2972 | + if (!ret) { |
---|
| 2973 | + struct ceph_mds_client *mdsc = fsc->mdsc; |
---|
| 2974 | + struct cap_wait cw; |
---|
2757 | 2975 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
---|
| 2976 | + |
---|
| 2977 | + cw.ino = ceph_ino(inode); |
---|
| 2978 | + cw.tgid = current->tgid; |
---|
| 2979 | + cw.need = need; |
---|
| 2980 | + cw.want = want; |
---|
| 2981 | + |
---|
| 2982 | + spin_lock(&mdsc->caps_list_lock); |
---|
| 2983 | + list_add(&cw.list, &mdsc->cap_wait_list); |
---|
| 2984 | + spin_unlock(&mdsc->caps_list_lock); |
---|
| 2985 | + |
---|
| 2986 | + /* make sure used fmode not timeout */ |
---|
| 2987 | + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); |
---|
2758 | 2988 | add_wait_queue(&ci->i_cap_wq, &wait); |
---|
2759 | 2989 | |
---|
2760 | | - while (!try_get_cap_refs(ci, need, want, endoff, |
---|
2761 | | - true, &_got, &err)) { |
---|
| 2990 | + flags |= NON_BLOCKING; |
---|
| 2991 | + while (!(ret = try_get_cap_refs(inode, need, want, |
---|
| 2992 | + endoff, flags, &_got))) { |
---|
2762 | 2993 | if (signal_pending(current)) { |
---|
2763 | 2994 | ret = -ERESTARTSYS; |
---|
2764 | 2995 | break; |
---|
.. | .. |
---|
2767 | 2998 | } |
---|
2768 | 2999 | |
---|
2769 | 3000 | remove_wait_queue(&ci->i_cap_wq, &wait); |
---|
| 3001 | + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); |
---|
2770 | 3002 | |
---|
2771 | | - if (err == -EAGAIN) |
---|
| 3003 | + spin_lock(&mdsc->caps_list_lock); |
---|
| 3004 | + list_del(&cw.list); |
---|
| 3005 | + spin_unlock(&mdsc->caps_list_lock); |
---|
| 3006 | + |
---|
| 3007 | + if (ret == -EAGAIN) |
---|
2772 | 3008 | continue; |
---|
2773 | | - if (err < 0) |
---|
2774 | | - ret = err; |
---|
2775 | 3009 | } |
---|
| 3010 | + |
---|
| 3011 | + if ((fi->fmode & CEPH_FILE_MODE_WR) && |
---|
| 3012 | + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { |
---|
| 3013 | + if (ret >= 0 && _got) |
---|
| 3014 | + ceph_put_cap_refs(ci, _got); |
---|
| 3015 | + return -EBADF; |
---|
| 3016 | + } |
---|
| 3017 | + |
---|
2776 | 3018 | if (ret < 0) { |
---|
2777 | | - if (err == -ESTALE) { |
---|
| 3019 | + if (ret == -EFBIG || ret == -ESTALE) { |
---|
| 3020 | + int ret2 = ceph_wait_on_async_create(inode); |
---|
| 3021 | + if (ret2 < 0) |
---|
| 3022 | + return ret2; |
---|
| 3023 | + } |
---|
| 3024 | + if (ret == -EFBIG) { |
---|
| 3025 | + check_max_size(inode, endoff); |
---|
| 3026 | + continue; |
---|
| 3027 | + } |
---|
| 3028 | + if (ret == -ESTALE) { |
---|
2778 | 3029 | /* session was killed, try renew caps */ |
---|
2779 | | - ret = ceph_renew_caps(&ci->vfs_inode); |
---|
| 3030 | + ret = ceph_renew_caps(inode, flags); |
---|
2780 | 3031 | if (ret == 0) |
---|
2781 | 3032 | continue; |
---|
2782 | 3033 | } |
---|
2783 | 3034 | return ret; |
---|
2784 | 3035 | } |
---|
2785 | 3036 | |
---|
2786 | | - if (ci->i_inline_version != CEPH_INLINE_NONE && |
---|
| 3037 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
---|
| 3038 | + ci->i_inline_version != CEPH_INLINE_NONE && |
---|
2787 | 3039 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
---|
2788 | | - i_size_read(&ci->vfs_inode) > 0) { |
---|
| 3040 | + i_size_read(inode) > 0) { |
---|
2789 | 3041 | struct page *page = |
---|
2790 | | - find_get_page(ci->vfs_inode.i_mapping, 0); |
---|
| 3042 | + find_get_page(inode->i_mapping, 0); |
---|
2791 | 3043 | if (page) { |
---|
2792 | 3044 | if (PageUptodate(page)) { |
---|
2793 | 3045 | *pinned_page = page; |
---|
.. | .. |
---|
2806 | 3058 | * getattr request will bring inline data into |
---|
2807 | 3059 | * page cache |
---|
2808 | 3060 | */ |
---|
2809 | | - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, |
---|
| 3061 | + ret = __ceph_do_getattr(inode, NULL, |
---|
2810 | 3062 | CEPH_STAT_CAP_INLINE_DATA, |
---|
2811 | 3063 | true); |
---|
2812 | 3064 | if (ret < 0) |
---|
.. | .. |
---|
2816 | 3068 | break; |
---|
2817 | 3069 | } |
---|
2818 | 3070 | |
---|
2819 | | - if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) |
---|
| 3071 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
---|
| 3072 | + (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) |
---|
2820 | 3073 | ceph_fscache_revalidate_cookie(ci); |
---|
2821 | 3074 | |
---|
2822 | 3075 | *got = _got; |
---|
.. | .. |
---|
2830 | 3083 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) |
---|
2831 | 3084 | { |
---|
2832 | 3085 | spin_lock(&ci->i_ceph_lock); |
---|
2833 | | - __take_cap_refs(ci, caps, false); |
---|
| 3086 | + ceph_take_cap_refs(ci, caps, false); |
---|
2834 | 3087 | spin_unlock(&ci->i_ceph_lock); |
---|
2835 | 3088 | } |
---|
2836 | 3089 | |
---|
.. | .. |
---|
2867 | 3120 | * If we are releasing a WR cap (from a sync write), finalize any affected |
---|
2868 | 3121 | * cap_snap, and wake up any waiters. |
---|
2869 | 3122 | */ |
---|
2870 | | -void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) |
---|
| 3123 | +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, |
---|
| 3124 | + bool skip_checking_caps) |
---|
2871 | 3125 | { |
---|
2872 | 3126 | struct inode *inode = &ci->vfs_inode; |
---|
2873 | 3127 | int last = 0, put = 0, flushsnaps = 0, wake = 0; |
---|
.. | .. |
---|
2880 | 3134 | last++; |
---|
2881 | 3135 | if (had & CEPH_CAP_FILE_CACHE) |
---|
2882 | 3136 | if (--ci->i_rdcache_ref == 0) |
---|
| 3137 | + last++; |
---|
| 3138 | + if (had & CEPH_CAP_FILE_EXCL) |
---|
| 3139 | + if (--ci->i_fx_ref == 0) |
---|
2883 | 3140 | last++; |
---|
2884 | 3141 | if (had & CEPH_CAP_FILE_BUFFER) { |
---|
2885 | 3142 | if (--ci->i_wb_ref == 0) { |
---|
.. | .. |
---|
2912 | 3169 | ci->i_head_snapc = NULL; |
---|
2913 | 3170 | } |
---|
2914 | 3171 | /* see comment in __ceph_remove_cap() */ |
---|
2915 | | - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) |
---|
| 3172 | + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) |
---|
2916 | 3173 | drop_inode_snap_realm(ci); |
---|
2917 | 3174 | } |
---|
2918 | 3175 | spin_unlock(&ci->i_ceph_lock); |
---|
.. | .. |
---|
2920 | 3177 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), |
---|
2921 | 3178 | last ? " last" : "", put ? " put" : ""); |
---|
2922 | 3179 | |
---|
2923 | | - if (last && !flushsnaps) |
---|
2924 | | - ceph_check_caps(ci, 0, NULL); |
---|
2925 | | - else if (flushsnaps) |
---|
2926 | | - ceph_flush_snaps(ci, NULL); |
---|
| 3180 | + if (!skip_checking_caps) { |
---|
| 3181 | + if (last) |
---|
| 3182 | + ceph_check_caps(ci, 0, NULL); |
---|
| 3183 | + else if (flushsnaps) |
---|
| 3184 | + ceph_flush_snaps(ci, NULL); |
---|
| 3185 | + } |
---|
2927 | 3186 | if (wake) |
---|
2928 | 3187 | wake_up_all(&ci->i_cap_wq); |
---|
2929 | 3188 | while (put-- > 0) |
---|
2930 | 3189 | iput(inode); |
---|
| 3190 | +} |
---|
| 3191 | + |
---|
| 3192 | +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) |
---|
| 3193 | +{ |
---|
| 3194 | + __ceph_put_cap_refs(ci, had, false); |
---|
| 3195 | +} |
---|
| 3196 | + |
---|
| 3197 | +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) |
---|
| 3198 | +{ |
---|
| 3199 | + __ceph_put_cap_refs(ci, had, true); |
---|
2931 | 3200 | } |
---|
2932 | 3201 | |
---|
2933 | 3202 | /* |
---|
.. | .. |
---|
2977 | 3246 | break; |
---|
2978 | 3247 | } |
---|
2979 | 3248 | } |
---|
2980 | | - BUG_ON(!found); |
---|
| 3249 | + |
---|
| 3250 | + if (!found) { |
---|
| 3251 | + /* |
---|
| 3252 | + * The capsnap should already be removed when removing |
---|
| 3253 | + * auth cap in the case of a forced unmount. |
---|
| 3254 | + */ |
---|
| 3255 | + WARN_ON_ONCE(ci->i_auth_cap); |
---|
| 3256 | + goto unlock; |
---|
| 3257 | + } |
---|
| 3258 | + |
---|
2981 | 3259 | capsnap->dirty_pages -= nr; |
---|
2982 | 3260 | if (capsnap->dirty_pages == 0) { |
---|
2983 | 3261 | complete_capsnap = true; |
---|
.. | .. |
---|
2999 | 3277 | complete_capsnap ? " (complete capsnap)" : ""); |
---|
3000 | 3278 | } |
---|
3001 | 3279 | |
---|
| 3280 | +unlock: |
---|
3002 | 3281 | spin_unlock(&ci->i_ceph_lock); |
---|
3003 | 3282 | |
---|
3004 | 3283 | if (last) { |
---|
3005 | | - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
---|
| 3284 | + ceph_check_caps(ci, 0, NULL); |
---|
3006 | 3285 | } else if (flush_snaps) { |
---|
3007 | 3286 | ceph_flush_snaps(ci, NULL); |
---|
3008 | 3287 | } |
---|
3009 | 3288 | if (complete_capsnap) |
---|
3010 | 3289 | wake_up_all(&ci->i_cap_wq); |
---|
3011 | | - while (put-- > 0) |
---|
3012 | | - iput(inode); |
---|
| 3290 | + while (put-- > 0) { |
---|
| 3291 | + /* avoid calling iput_final() in osd dispatch threads */ |
---|
| 3292 | + ceph_async_iput(inode); |
---|
| 3293 | + } |
---|
3013 | 3294 | } |
---|
3014 | 3295 | |
---|
3015 | 3296 | /* |
---|
.. | .. |
---|
3054 | 3335 | bool dirstat_valid; |
---|
3055 | 3336 | u64 nfiles; |
---|
3056 | 3337 | u64 nsubdirs; |
---|
| 3338 | + u64 change_attr; |
---|
3057 | 3339 | /* currently issued */ |
---|
3058 | 3340 | int issued; |
---|
| 3341 | + struct timespec64 btime; |
---|
3059 | 3342 | }; |
---|
3060 | 3343 | |
---|
3061 | 3344 | /* |
---|
.. | .. |
---|
3079 | 3362 | int used, wanted, dirty; |
---|
3080 | 3363 | u64 size = le64_to_cpu(grant->size); |
---|
3081 | 3364 | u64 max_size = le64_to_cpu(grant->max_size); |
---|
3082 | | - int check_caps = 0; |
---|
| 3365 | + unsigned char check_caps = 0; |
---|
| 3366 | + bool was_stale = cap->cap_gen < session->s_cap_gen; |
---|
3083 | 3367 | bool wake = false; |
---|
3084 | 3368 | bool writeback = false; |
---|
3085 | 3369 | bool queue_trunc = false; |
---|
.. | .. |
---|
3092 | 3376 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, |
---|
3093 | 3377 | inode->i_size); |
---|
3094 | 3378 | |
---|
| 3379 | + |
---|
| 3380 | + /* |
---|
| 3381 | + * If CACHE is being revoked, and we have no dirty buffers, |
---|
| 3382 | + * try to invalidate (once). (If there are dirty buffers, we |
---|
| 3383 | + * will invalidate _after_ writeback.) |
---|
| 3384 | + */ |
---|
| 3385 | + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ |
---|
| 3386 | + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && |
---|
| 3387 | + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
---|
| 3388 | + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { |
---|
| 3389 | + if (try_nonblocking_invalidate(inode)) { |
---|
| 3390 | + /* there were locked pages.. invalidate later |
---|
| 3391 | + in a separate thread. */ |
---|
| 3392 | + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
---|
| 3393 | + queue_invalidate = true; |
---|
| 3394 | + ci->i_rdcache_revoking = ci->i_rdcache_gen; |
---|
| 3395 | + } |
---|
| 3396 | + } |
---|
| 3397 | + } |
---|
| 3398 | + |
---|
| 3399 | + if (was_stale) |
---|
| 3400 | + cap->issued = cap->implemented = CEPH_CAP_PIN; |
---|
3095 | 3401 | |
---|
3096 | 3402 | /* |
---|
3097 | 3403 | * auth mds of the inode changed. we received the cap export message, |
---|
.. | .. |
---|
3108 | 3414 | newcaps |= cap->issued; |
---|
3109 | 3415 | } |
---|
3110 | 3416 | |
---|
3111 | | - /* |
---|
3112 | | - * If CACHE is being revoked, and we have no dirty buffers, |
---|
3113 | | - * try to invalidate (once). (If there are dirty buffers, we |
---|
3114 | | - * will invalidate _after_ writeback.) |
---|
3115 | | - */ |
---|
3116 | | - if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ |
---|
3117 | | - ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && |
---|
3118 | | - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
---|
3119 | | - !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { |
---|
3120 | | - if (try_nonblocking_invalidate(inode)) { |
---|
3121 | | - /* there were locked pages.. invalidate later |
---|
3122 | | - in a separate thread. */ |
---|
3123 | | - if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
---|
3124 | | - queue_invalidate = true; |
---|
3125 | | - ci->i_rdcache_revoking = ci->i_rdcache_gen; |
---|
3126 | | - } |
---|
3127 | | - } |
---|
3128 | | - } |
---|
3129 | | - |
---|
3130 | 3417 | /* side effects now are allowed */ |
---|
3131 | 3418 | cap->cap_gen = session->s_cap_gen; |
---|
3132 | 3419 | cap->seq = seq; |
---|
3133 | 3420 | |
---|
3134 | 3421 | __check_cap_issue(ci, cap, newcaps); |
---|
3135 | 3422 | |
---|
| 3423 | + inode_set_max_iversion_raw(inode, extra_info->change_attr); |
---|
| 3424 | + |
---|
3136 | 3425 | if ((newcaps & CEPH_CAP_AUTH_SHARED) && |
---|
3137 | 3426 | (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { |
---|
3138 | 3427 | inode->i_mode = le32_to_cpu(grant->mode); |
---|
3139 | 3428 | inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); |
---|
3140 | 3429 | inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); |
---|
| 3430 | + ci->i_btime = extra_info->btime; |
---|
3141 | 3431 | dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, |
---|
3142 | 3432 | from_kuid(&init_user_ns, inode->i_uid), |
---|
3143 | 3433 | from_kgid(&init_user_ns, inode->i_gid)); |
---|
.. | .. |
---|
3164 | 3454 | ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); |
---|
3165 | 3455 | ci->i_xattrs.version = version; |
---|
3166 | 3456 | ceph_forget_all_cached_acls(inode); |
---|
| 3457 | + ceph_security_invalidate_secctx(inode); |
---|
3167 | 3458 | } |
---|
3168 | 3459 | } |
---|
3169 | 3460 | |
---|
.. | .. |
---|
3216 | 3507 | ci->i_requested_max_size = 0; |
---|
3217 | 3508 | } |
---|
3218 | 3509 | wake = true; |
---|
3219 | | - } else if (ci->i_wanted_max_size > ci->i_max_size && |
---|
3220 | | - ci->i_wanted_max_size > ci->i_requested_max_size) { |
---|
3221 | | - /* CEPH_CAP_OP_IMPORT */ |
---|
3222 | | - wake = true; |
---|
3223 | 3510 | } |
---|
3224 | 3511 | } |
---|
3225 | 3512 | |
---|
.. | .. |
---|
3231 | 3518 | ceph_cap_string(wanted), |
---|
3232 | 3519 | ceph_cap_string(used), |
---|
3233 | 3520 | ceph_cap_string(dirty)); |
---|
3234 | | - if (wanted != le32_to_cpu(grant->wanted)) { |
---|
3235 | | - dout("mds wanted %s -> %s\n", |
---|
3236 | | - ceph_cap_string(le32_to_cpu(grant->wanted)), |
---|
3237 | | - ceph_cap_string(wanted)); |
---|
3238 | | - /* imported cap may not have correct mds_wanted */ |
---|
3239 | | - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) |
---|
3240 | | - check_caps = 1; |
---|
| 3521 | + |
---|
| 3522 | + if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && |
---|
| 3523 | + (wanted & ~(cap->mds_wanted | newcaps))) { |
---|
| 3524 | + /* |
---|
| 3525 | + * If mds is importing cap, prior cap messages that update |
---|
| 3526 | + * 'wanted' may get dropped by mds (migrate seq mismatch). |
---|
| 3527 | + * |
---|
| 3528 | + * We don't send cap message to update 'wanted' if what we |
---|
| 3529 | + * want are already issued. If mds revokes caps, cap message |
---|
| 3530 | + * that releases caps also tells mds what we want. But if |
---|
| 3531 | + * caps got forcibly revoked by mds (session stale), we may |
---|
| 3532 | + * not have told mds what we want. |
---|
| 3533 | + */ |
---|
| 3534 | + check_caps = 1; |
---|
3241 | 3535 | } |
---|
3242 | 3536 | |
---|
3243 | 3537 | /* revocation, grant, or no-op? */ |
---|
.. | .. |
---|
3248 | 3542 | ceph_cap_string(cap->issued), |
---|
3249 | 3543 | ceph_cap_string(newcaps), |
---|
3250 | 3544 | ceph_cap_string(revoking)); |
---|
3251 | | - if (revoking & used & CEPH_CAP_FILE_BUFFER) |
---|
| 3545 | + if (S_ISREG(inode->i_mode) && |
---|
| 3546 | + (revoking & used & CEPH_CAP_FILE_BUFFER)) |
---|
3252 | 3547 | writeback = true; /* initiate writeback; will delay ack */ |
---|
3253 | | - else if (revoking == CEPH_CAP_FILE_CACHE && |
---|
3254 | | - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
---|
3255 | | - queue_invalidate) |
---|
| 3548 | + else if (queue_invalidate && |
---|
| 3549 | + revoking == CEPH_CAP_FILE_CACHE && |
---|
| 3550 | + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) |
---|
3256 | 3551 | ; /* do nothing yet, invalidation will be queued */ |
---|
3257 | 3552 | else if (cap == ci->i_auth_cap) |
---|
3258 | 3553 | check_caps = 1; /* check auth cap only */ |
---|
.. | .. |
---|
3279 | 3574 | } |
---|
3280 | 3575 | BUG_ON(cap->issued & ~cap->implemented); |
---|
3281 | 3576 | |
---|
| 3577 | + /* don't let check_caps skip sending a response to MDS for revoke msgs */ |
---|
| 3578 | + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { |
---|
| 3579 | + cap->mds_wanted = 0; |
---|
| 3580 | + if (cap == ci->i_auth_cap) |
---|
| 3581 | + check_caps = 1; /* check auth cap only */ |
---|
| 3582 | + else |
---|
| 3583 | + check_caps = 2; /* check all caps */ |
---|
| 3584 | + } |
---|
| 3585 | + |
---|
3282 | 3586 | if (extra_info->inline_version > 0 && |
---|
3283 | 3587 | extra_info->inline_version >= ci->i_inline_version) { |
---|
3284 | 3588 | ci->i_inline_version = extra_info->inline_version; |
---|
.. | .. |
---|
3288 | 3592 | } |
---|
3289 | 3593 | |
---|
3290 | 3594 | if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { |
---|
3291 | | - if (newcaps & ~extra_info->issued) |
---|
3292 | | - wake = true; |
---|
3293 | | - kick_flushing_inode_caps(session->s_mdsc, session, inode); |
---|
| 3595 | + if (ci->i_auth_cap == cap) { |
---|
| 3596 | + if (newcaps & ~extra_info->issued) |
---|
| 3597 | + wake = true; |
---|
| 3598 | + |
---|
| 3599 | + if (ci->i_requested_max_size > max_size || |
---|
| 3600 | + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { |
---|
| 3601 | + /* re-request max_size if necessary */ |
---|
| 3602 | + ci->i_requested_max_size = 0; |
---|
| 3603 | + wake = true; |
---|
| 3604 | + } |
---|
| 3605 | + |
---|
| 3606 | + ceph_kick_flushing_inode_caps(session, ci); |
---|
| 3607 | + } |
---|
3294 | 3608 | up_read(&session->s_mdsc->snap_rwsem); |
---|
3295 | | - } else { |
---|
3296 | | - spin_unlock(&ci->i_ceph_lock); |
---|
3297 | 3609 | } |
---|
| 3610 | + spin_unlock(&ci->i_ceph_lock); |
---|
3298 | 3611 | |
---|
3299 | 3612 | if (fill_inline) |
---|
3300 | 3613 | ceph_fill_inline_data(inode, NULL, extra_info->inline_data, |
---|
.. | .. |
---|
3318 | 3631 | wake_up_all(&ci->i_cap_wq); |
---|
3319 | 3632 | |
---|
3320 | 3633 | if (check_caps == 1) |
---|
3321 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, |
---|
| 3634 | + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, |
---|
3322 | 3635 | session); |
---|
3323 | 3636 | else if (check_caps == 2) |
---|
3324 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); |
---|
| 3637 | + ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); |
---|
3325 | 3638 | else |
---|
3326 | 3639 | mutex_unlock(&session->s_mutex); |
---|
3327 | 3640 | } |
---|
.. | .. |
---|
3348 | 3661 | bool wake_mdsc = false; |
---|
3349 | 3662 | |
---|
3350 | 3663 | list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { |
---|
| 3664 | + /* Is this the one that was flushed? */ |
---|
3351 | 3665 | if (cf->tid == flush_tid) |
---|
3352 | 3666 | cleaned = cf->caps; |
---|
3353 | | - if (cf->caps == 0) /* capsnap */ |
---|
| 3667 | + |
---|
| 3668 | + /* Is this a capsnap? */ |
---|
| 3669 | + if (cf->is_capsnap) |
---|
3354 | 3670 | continue; |
---|
| 3671 | + |
---|
3355 | 3672 | if (cf->tid <= flush_tid) { |
---|
3356 | | - if (__finish_cap_flush(NULL, ci, cf)) |
---|
3357 | | - wake_ci = true; |
---|
| 3673 | + /* |
---|
| 3674 | + * An earlier or current tid. The FLUSH_ACK should |
---|
| 3675 | + * represent a superset of this flush's caps. |
---|
| 3676 | + */ |
---|
| 3677 | + wake_ci |= __detach_cap_flush_from_ci(ci, cf); |
---|
3358 | 3678 | list_add_tail(&cf->i_list, &to_remove); |
---|
3359 | 3679 | } else { |
---|
| 3680 | + /* |
---|
| 3681 | + * This is a later one. Any caps in it are still dirty |
---|
| 3682 | + * so don't count them as cleaned. |
---|
| 3683 | + */ |
---|
3360 | 3684 | cleaned &= ~cf->caps; |
---|
3361 | 3685 | if (!cleaned) |
---|
3362 | 3686 | break; |
---|
.. | .. |
---|
3376 | 3700 | |
---|
3377 | 3701 | spin_lock(&mdsc->cap_dirty_lock); |
---|
3378 | 3702 | |
---|
3379 | | - list_for_each_entry(cf, &to_remove, i_list) { |
---|
3380 | | - if (__finish_cap_flush(mdsc, NULL, cf)) |
---|
3381 | | - wake_mdsc = true; |
---|
3382 | | - } |
---|
| 3703 | + list_for_each_entry(cf, &to_remove, i_list) |
---|
| 3704 | + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); |
---|
3383 | 3705 | |
---|
3384 | 3706 | if (ci->i_flushing_caps == 0) { |
---|
3385 | 3707 | if (list_empty(&ci->i_cap_flush_list)) { |
---|
.. | .. |
---|
3417 | 3739 | while (!list_empty(&to_remove)) { |
---|
3418 | 3740 | cf = list_first_entry(&to_remove, |
---|
3419 | 3741 | struct ceph_cap_flush, i_list); |
---|
3420 | | - list_del(&cf->i_list); |
---|
3421 | | - ceph_free_cap_flush(cf); |
---|
| 3742 | + list_del_init(&cf->i_list); |
---|
| 3743 | + if (!cf->is_capsnap) |
---|
| 3744 | + ceph_free_cap_flush(cf); |
---|
3422 | 3745 | } |
---|
3423 | 3746 | |
---|
3424 | 3747 | if (wake_ci) |
---|
.. | .. |
---|
3427 | 3750 | wake_up_all(&mdsc->cap_flushing_wq); |
---|
3428 | 3751 | if (drop) |
---|
3429 | 3752 | iput(inode); |
---|
| 3753 | +} |
---|
| 3754 | + |
---|
| 3755 | +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, |
---|
| 3756 | + bool *wake_ci, bool *wake_mdsc) |
---|
| 3757 | +{ |
---|
| 3758 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 3759 | + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
---|
| 3760 | + bool ret; |
---|
| 3761 | + |
---|
| 3762 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 3763 | + |
---|
| 3764 | + dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); |
---|
| 3765 | + |
---|
| 3766 | + list_del_init(&capsnap->ci_item); |
---|
| 3767 | + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); |
---|
| 3768 | + if (wake_ci) |
---|
| 3769 | + *wake_ci = ret; |
---|
| 3770 | + |
---|
| 3771 | + spin_lock(&mdsc->cap_dirty_lock); |
---|
| 3772 | + if (list_empty(&ci->i_cap_flush_list)) |
---|
| 3773 | + list_del_init(&ci->i_flushing_item); |
---|
| 3774 | + |
---|
| 3775 | + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); |
---|
| 3776 | + if (wake_mdsc) |
---|
| 3777 | + *wake_mdsc = ret; |
---|
| 3778 | + spin_unlock(&mdsc->cap_dirty_lock); |
---|
| 3779 | +} |
---|
| 3780 | + |
---|
| 3781 | +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, |
---|
| 3782 | + bool *wake_ci, bool *wake_mdsc) |
---|
| 3783 | +{ |
---|
| 3784 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 3785 | + |
---|
| 3786 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
| 3787 | + |
---|
| 3788 | + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); |
---|
| 3789 | + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); |
---|
3430 | 3790 | } |
---|
3431 | 3791 | |
---|
3432 | 3792 | /* |
---|
.. | .. |
---|
3466 | 3826 | capsnap, capsnap->follows); |
---|
3467 | 3827 | } |
---|
3468 | 3828 | } |
---|
3469 | | - if (flushed) { |
---|
3470 | | - WARN_ON(capsnap->dirty_pages || capsnap->writing); |
---|
3471 | | - dout(" removing %p cap_snap %p follows %lld\n", |
---|
3472 | | - inode, capsnap, follows); |
---|
3473 | | - list_del(&capsnap->ci_item); |
---|
3474 | | - if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) |
---|
3475 | | - wake_ci = true; |
---|
3476 | | - |
---|
3477 | | - spin_lock(&mdsc->cap_dirty_lock); |
---|
3478 | | - |
---|
3479 | | - if (list_empty(&ci->i_cap_flush_list)) |
---|
3480 | | - list_del_init(&ci->i_flushing_item); |
---|
3481 | | - |
---|
3482 | | - if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) |
---|
3483 | | - wake_mdsc = true; |
---|
3484 | | - |
---|
3485 | | - spin_unlock(&mdsc->cap_dirty_lock); |
---|
3486 | | - } |
---|
| 3829 | + if (flushed) |
---|
| 3830 | + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); |
---|
3487 | 3831 | spin_unlock(&ci->i_ceph_lock); |
---|
| 3832 | + |
---|
3488 | 3833 | if (flushed) { |
---|
3489 | 3834 | ceph_put_snap_context(capsnap->context); |
---|
3490 | 3835 | ceph_put_cap_snap(capsnap); |
---|
.. | .. |
---|
3501 | 3846 | * |
---|
3502 | 3847 | * caller hold s_mutex. |
---|
3503 | 3848 | */ |
---|
3504 | | -static void handle_cap_trunc(struct inode *inode, |
---|
| 3849 | +static bool handle_cap_trunc(struct inode *inode, |
---|
3505 | 3850 | struct ceph_mds_caps *trunc, |
---|
3506 | 3851 | struct ceph_mds_session *session) |
---|
3507 | | - __releases(ci->i_ceph_lock) |
---|
3508 | 3852 | { |
---|
3509 | 3853 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
3510 | 3854 | int mds = session->s_mds; |
---|
.. | .. |
---|
3515 | 3859 | int implemented = 0; |
---|
3516 | 3860 | int dirty = __ceph_caps_dirty(ci); |
---|
3517 | 3861 | int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); |
---|
3518 | | - int queue_trunc = 0; |
---|
| 3862 | + bool queue_trunc = false; |
---|
| 3863 | + |
---|
| 3864 | + lockdep_assert_held(&ci->i_ceph_lock); |
---|
3519 | 3865 | |
---|
3520 | 3866 | issued |= implemented | dirty; |
---|
3521 | 3867 | |
---|
.. | .. |
---|
3523 | 3869 | inode, mds, seq, truncate_size, truncate_seq); |
---|
3524 | 3870 | queue_trunc = ceph_fill_file_size(inode, issued, |
---|
3525 | 3871 | truncate_seq, truncate_size, size); |
---|
3526 | | - spin_unlock(&ci->i_ceph_lock); |
---|
3527 | | - |
---|
3528 | | - if (queue_trunc) |
---|
3529 | | - ceph_queue_vmtruncate(inode); |
---|
| 3872 | + return queue_trunc; |
---|
3530 | 3873 | } |
---|
3531 | 3874 | |
---|
3532 | 3875 | /* |
---|
.. | .. |
---|
3571 | 3914 | |
---|
3572 | 3915 | if (target < 0) { |
---|
3573 | 3916 | __ceph_remove_cap(cap, false); |
---|
3574 | | - if (!ci->i_auth_cap) |
---|
3575 | | - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; |
---|
3576 | 3917 | goto out_unlock; |
---|
3577 | 3918 | } |
---|
3578 | 3919 | |
---|
.. | .. |
---|
3602 | 3943 | tcap->issue_seq = t_seq - 1; |
---|
3603 | 3944 | tcap->issued |= issued; |
---|
3604 | 3945 | tcap->implemented |= issued; |
---|
3605 | | - if (cap == ci->i_auth_cap) |
---|
| 3946 | + if (cap == ci->i_auth_cap) { |
---|
3606 | 3947 | ci->i_auth_cap = tcap; |
---|
3607 | | - |
---|
3608 | | - if (!list_empty(&ci->i_cap_flush_list) && |
---|
3609 | | - ci->i_auth_cap == tcap) { |
---|
3610 | | - spin_lock(&mdsc->cap_dirty_lock); |
---|
3611 | | - list_move_tail(&ci->i_flushing_item, |
---|
3612 | | - &tcap->session->s_cap_flushing); |
---|
3613 | | - spin_unlock(&mdsc->cap_dirty_lock); |
---|
| 3948 | + change_auth_cap_ses(ci, tcap->session); |
---|
3614 | 3949 | } |
---|
3615 | 3950 | } |
---|
3616 | 3951 | __ceph_remove_cap(cap, false); |
---|
.. | .. |
---|
3619 | 3954 | /* add placeholder for the export target */ |
---|
3620 | 3955 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; |
---|
3621 | 3956 | tcap = new_cap; |
---|
3622 | | - ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, |
---|
| 3957 | + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, |
---|
3623 | 3958 | t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); |
---|
3624 | 3959 | |
---|
3625 | 3960 | if (!list_empty(&ci->i_cap_flush_list) && |
---|
.. | .. |
---|
3679 | 4014 | struct ceph_mds_cap_peer *ph, |
---|
3680 | 4015 | struct ceph_mds_session *session, |
---|
3681 | 4016 | struct ceph_cap **target_cap, int *old_issued) |
---|
3682 | | - __acquires(ci->i_ceph_lock) |
---|
3683 | 4017 | { |
---|
3684 | 4018 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
3685 | 4019 | struct ceph_cap *cap, *ocap, *new_cap = NULL; |
---|
.. | .. |
---|
3704 | 4038 | |
---|
3705 | 4039 | dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", |
---|
3706 | 4040 | inode, ci, mds, mseq, peer); |
---|
3707 | | - |
---|
3708 | 4041 | retry: |
---|
3709 | | - spin_lock(&ci->i_ceph_lock); |
---|
3710 | 4042 | cap = __get_cap_for_mds(ci, mds); |
---|
3711 | 4043 | if (!cap) { |
---|
3712 | 4044 | if (!new_cap) { |
---|
3713 | 4045 | spin_unlock(&ci->i_ceph_lock); |
---|
3714 | 4046 | new_cap = ceph_get_cap(mdsc, NULL); |
---|
| 4047 | + spin_lock(&ci->i_ceph_lock); |
---|
3715 | 4048 | goto retry; |
---|
3716 | 4049 | } |
---|
3717 | 4050 | cap = new_cap; |
---|
.. | .. |
---|
3725 | 4058 | __ceph_caps_issued(ci, &issued); |
---|
3726 | 4059 | issued |= __ceph_caps_dirty(ci); |
---|
3727 | 4060 | |
---|
3728 | | - ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, |
---|
| 4061 | + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, |
---|
3729 | 4062 | realmino, CEPH_CAP_FLAG_AUTH, &new_cap); |
---|
3730 | 4063 | |
---|
3731 | 4064 | ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; |
---|
.. | .. |
---|
3745 | 4078 | } |
---|
3746 | 4079 | __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); |
---|
3747 | 4080 | } |
---|
3748 | | - |
---|
3749 | | - /* make sure we re-request max_size, if necessary */ |
---|
3750 | | - ci->i_requested_max_size = 0; |
---|
3751 | 4081 | |
---|
3752 | 4082 | *old_issued = issued; |
---|
3753 | 4083 | *target_cap = cap; |
---|
.. | .. |
---|
3777 | 4107 | size_t snaptrace_len; |
---|
3778 | 4108 | void *p, *end; |
---|
3779 | 4109 | struct cap_extra_info extra_info = {}; |
---|
| 4110 | + bool queue_trunc; |
---|
3780 | 4111 | |
---|
3781 | 4112 | dout("handle_caps from mds%d\n", session->s_mds); |
---|
3782 | 4113 | |
---|
.. | .. |
---|
3852 | 4183 | } |
---|
3853 | 4184 | } |
---|
3854 | 4185 | |
---|
3855 | | - if (msg_version >= 11) { |
---|
| 4186 | + if (msg_version >= 9) { |
---|
3856 | 4187 | struct ceph_timespec *btime; |
---|
3857 | | - u64 change_attr; |
---|
3858 | | - u32 flags; |
---|
3859 | 4188 | |
---|
3860 | | - /* version >= 9 */ |
---|
3861 | 4189 | if (p + sizeof(*btime) > end) |
---|
3862 | 4190 | goto bad; |
---|
3863 | 4191 | btime = p; |
---|
| 4192 | + ceph_decode_timespec64(&extra_info.btime, btime); |
---|
3864 | 4193 | p += sizeof(*btime); |
---|
3865 | | - ceph_decode_64_safe(&p, end, change_attr, bad); |
---|
| 4194 | + ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); |
---|
| 4195 | + } |
---|
| 4196 | + |
---|
| 4197 | + if (msg_version >= 11) { |
---|
| 4198 | + u32 flags; |
---|
3866 | 4199 | /* version >= 10 */ |
---|
3867 | 4200 | ceph_decode_32_safe(&p, end, flags, bad); |
---|
3868 | 4201 | /* version >= 11 */ |
---|
.. | .. |
---|
3878 | 4211 | vino.snap, inode); |
---|
3879 | 4212 | |
---|
3880 | 4213 | mutex_lock(&session->s_mutex); |
---|
3881 | | - session->s_seq++; |
---|
| 4214 | + inc_session_sequence(session); |
---|
3882 | 4215 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
---|
3883 | 4216 | (unsigned)seq); |
---|
3884 | 4217 | |
---|
.. | .. |
---|
3894 | 4227 | cap->seq = seq; |
---|
3895 | 4228 | cap->issue_seq = seq; |
---|
3896 | 4229 | spin_lock(&session->s_cap_lock); |
---|
3897 | | - list_add_tail(&cap->session_caps, |
---|
3898 | | - &session->s_cap_releases); |
---|
3899 | | - session->s_num_cap_releases++; |
---|
| 4230 | + __ceph_queue_cap_release(session, cap); |
---|
3900 | 4231 | spin_unlock(&session->s_cap_lock); |
---|
3901 | 4232 | } |
---|
3902 | 4233 | goto flush_cap_releases; |
---|
.. | .. |
---|
3924 | 4255 | } else { |
---|
3925 | 4256 | down_read(&mdsc->snap_rwsem); |
---|
3926 | 4257 | } |
---|
| 4258 | + spin_lock(&ci->i_ceph_lock); |
---|
3927 | 4259 | handle_cap_import(mdsc, inode, h, peer, session, |
---|
3928 | 4260 | &cap, &extra_info.issued); |
---|
3929 | 4261 | handle_cap_grant(inode, session, cap, |
---|
.. | .. |
---|
3960 | 4292 | break; |
---|
3961 | 4293 | |
---|
3962 | 4294 | case CEPH_CAP_OP_TRUNC: |
---|
3963 | | - handle_cap_trunc(inode, h, session); |
---|
| 4295 | + queue_trunc = handle_cap_trunc(inode, h, session); |
---|
| 4296 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 4297 | + if (queue_trunc) |
---|
| 4298 | + ceph_queue_vmtruncate(inode); |
---|
3964 | 4299 | break; |
---|
3965 | 4300 | |
---|
3966 | 4301 | default: |
---|
.. | .. |
---|
3969 | 4304 | ceph_cap_op_name(op)); |
---|
3970 | 4305 | } |
---|
3971 | 4306 | |
---|
3972 | | - goto done; |
---|
| 4307 | +done: |
---|
| 4308 | + mutex_unlock(&session->s_mutex); |
---|
| 4309 | +done_unlocked: |
---|
| 4310 | + ceph_put_string(extra_info.pool_ns); |
---|
| 4311 | + /* avoid calling iput_final() in mds dispatch threads */ |
---|
| 4312 | + ceph_async_iput(inode); |
---|
| 4313 | + return; |
---|
3973 | 4314 | |
---|
3974 | 4315 | flush_cap_releases: |
---|
3975 | 4316 | /* |
---|
.. | .. |
---|
3977 | 4318 | * along for the mds (who clearly thinks we still have this |
---|
3978 | 4319 | * cap). |
---|
3979 | 4320 | */ |
---|
3980 | | - ceph_send_cap_releases(mdsc, session); |
---|
3981 | | - |
---|
3982 | | -done: |
---|
3983 | | - mutex_unlock(&session->s_mutex); |
---|
3984 | | -done_unlocked: |
---|
3985 | | - iput(inode); |
---|
3986 | | - ceph_put_string(extra_info.pool_ns); |
---|
3987 | | - return; |
---|
| 4321 | + ceph_flush_cap_releases(mdsc, session); |
---|
| 4322 | + goto done; |
---|
3988 | 4323 | |
---|
3989 | 4324 | bad: |
---|
3990 | 4325 | pr_err("ceph_handle_caps: corrupt message\n"); |
---|
.. | .. |
---|
3994 | 4329 | |
---|
3995 | 4330 | /* |
---|
3996 | 4331 | * Delayed work handler to process end of delayed cap release LRU list. |
---|
| 4332 | + * |
---|
| 4333 | + * If new caps are added to the list while processing it, these won't get |
---|
| 4334 | + * processed in this run. In this case, the ci->i_hold_caps_max will be |
---|
| 4335 | + * returned so that the work can be scheduled accordingly. |
---|
3997 | 4336 | */ |
---|
3998 | | -void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) |
---|
| 4337 | +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) |
---|
3999 | 4338 | { |
---|
4000 | 4339 | struct inode *inode; |
---|
4001 | 4340 | struct ceph_inode_info *ci; |
---|
4002 | | - int flags = CHECK_CAPS_NODELAY; |
---|
| 4341 | + struct ceph_mount_options *opt = mdsc->fsc->mount_options; |
---|
| 4342 | + unsigned long delay_max = opt->caps_wanted_delay_max * HZ; |
---|
| 4343 | + unsigned long loop_start = jiffies; |
---|
| 4344 | + unsigned long delay = 0; |
---|
4003 | 4345 | |
---|
4004 | 4346 | dout("check_delayed_caps\n"); |
---|
4005 | | - while (1) { |
---|
4006 | | - spin_lock(&mdsc->cap_delay_lock); |
---|
4007 | | - if (list_empty(&mdsc->cap_delay_list)) |
---|
4008 | | - break; |
---|
| 4347 | + spin_lock(&mdsc->cap_delay_lock); |
---|
| 4348 | + while (!list_empty(&mdsc->cap_delay_list)) { |
---|
4009 | 4349 | ci = list_first_entry(&mdsc->cap_delay_list, |
---|
4010 | 4350 | struct ceph_inode_info, |
---|
4011 | 4351 | i_cap_delay_list); |
---|
| 4352 | + if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { |
---|
| 4353 | + dout("%s caps added recently. Exiting loop", __func__); |
---|
| 4354 | + delay = ci->i_hold_caps_max; |
---|
| 4355 | + break; |
---|
| 4356 | + } |
---|
4012 | 4357 | if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && |
---|
4013 | 4358 | time_before(jiffies, ci->i_hold_caps_max)) |
---|
4014 | 4359 | break; |
---|
4015 | 4360 | list_del_init(&ci->i_cap_delay_list); |
---|
4016 | 4361 | |
---|
4017 | 4362 | inode = igrab(&ci->vfs_inode); |
---|
4018 | | - spin_unlock(&mdsc->cap_delay_lock); |
---|
4019 | | - |
---|
4020 | 4363 | if (inode) { |
---|
| 4364 | + spin_unlock(&mdsc->cap_delay_lock); |
---|
4021 | 4365 | dout("check_delayed_caps on %p\n", inode); |
---|
4022 | | - ceph_check_caps(ci, flags, NULL); |
---|
4023 | | - iput(inode); |
---|
| 4366 | + ceph_check_caps(ci, 0, NULL); |
---|
| 4367 | + /* avoid calling iput_final() in tick thread */ |
---|
| 4368 | + ceph_async_iput(inode); |
---|
| 4369 | + spin_lock(&mdsc->cap_delay_lock); |
---|
4024 | 4370 | } |
---|
4025 | 4371 | } |
---|
4026 | 4372 | spin_unlock(&mdsc->cap_delay_lock); |
---|
| 4373 | + |
---|
| 4374 | + return delay; |
---|
4027 | 4375 | } |
---|
4028 | 4376 | |
---|
4029 | 4377 | /* |
---|
4030 | 4378 | * Flush all dirty caps to the mds |
---|
4031 | 4379 | */ |
---|
4032 | | -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) |
---|
| 4380 | +static void flush_dirty_session_caps(struct ceph_mds_session *s) |
---|
4033 | 4381 | { |
---|
| 4382 | + struct ceph_mds_client *mdsc = s->s_mdsc; |
---|
4034 | 4383 | struct ceph_inode_info *ci; |
---|
4035 | 4384 | struct inode *inode; |
---|
4036 | 4385 | |
---|
4037 | 4386 | dout("flush_dirty_caps\n"); |
---|
4038 | 4387 | spin_lock(&mdsc->cap_dirty_lock); |
---|
4039 | | - while (!list_empty(&mdsc->cap_dirty)) { |
---|
4040 | | - ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, |
---|
| 4388 | + while (!list_empty(&s->s_cap_dirty)) { |
---|
| 4389 | + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, |
---|
4041 | 4390 | i_dirty_item); |
---|
4042 | 4391 | inode = &ci->vfs_inode; |
---|
4043 | 4392 | ihold(inode); |
---|
4044 | 4393 | dout("flush_dirty_caps %p\n", inode); |
---|
4045 | 4394 | spin_unlock(&mdsc->cap_dirty_lock); |
---|
4046 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); |
---|
| 4395 | + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); |
---|
4047 | 4396 | iput(inode); |
---|
4048 | 4397 | spin_lock(&mdsc->cap_dirty_lock); |
---|
4049 | 4398 | } |
---|
.. | .. |
---|
4051 | 4400 | dout("flush_dirty_caps done\n"); |
---|
4052 | 4401 | } |
---|
4053 | 4402 | |
---|
4054 | | -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) |
---|
| 4403 | +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) |
---|
4055 | 4404 | { |
---|
4056 | | - int i; |
---|
| 4405 | + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); |
---|
| 4406 | +} |
---|
| 4407 | + |
---|
| 4408 | +void __ceph_touch_fmode(struct ceph_inode_info *ci, |
---|
| 4409 | + struct ceph_mds_client *mdsc, int fmode) |
---|
| 4410 | +{ |
---|
| 4411 | + unsigned long now = jiffies; |
---|
| 4412 | + if (fmode & CEPH_FILE_MODE_RD) |
---|
| 4413 | + ci->i_last_rd = now; |
---|
| 4414 | + if (fmode & CEPH_FILE_MODE_WR) |
---|
| 4415 | + ci->i_last_wr = now; |
---|
| 4416 | + /* queue periodic check */ |
---|
| 4417 | + if (fmode && |
---|
| 4418 | + __ceph_is_any_real_caps(ci) && |
---|
| 4419 | + list_empty(&ci->i_cap_delay_list)) |
---|
| 4420 | + __cap_delay_requeue(mdsc, ci); |
---|
| 4421 | +} |
---|
| 4422 | + |
---|
| 4423 | +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) |
---|
| 4424 | +{ |
---|
| 4425 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); |
---|
4057 | 4426 | int bits = (fmode << 1) | 1; |
---|
| 4427 | + bool already_opened = false; |
---|
| 4428 | + int i; |
---|
| 4429 | + |
---|
| 4430 | + if (count == 1) |
---|
| 4431 | + atomic64_inc(&mdsc->metric.opened_files); |
---|
| 4432 | + |
---|
| 4433 | + spin_lock(&ci->i_ceph_lock); |
---|
4058 | 4434 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
---|
| 4435 | + /* |
---|
| 4436 | + * If any of the mode refs is larger than 0, |
---|
| 4437 | + * that means it has already been opened by |
---|
| 4438 | + * others. Just skip checking the PIN ref. |
---|
| 4439 | + */ |
---|
| 4440 | + if (i && ci->i_nr_by_mode[i]) |
---|
| 4441 | + already_opened = true; |
---|
| 4442 | + |
---|
4059 | 4443 | if (bits & (1 << i)) |
---|
4060 | | - ci->i_nr_by_mode[i]++; |
---|
| 4444 | + ci->i_nr_by_mode[i] += count; |
---|
4061 | 4445 | } |
---|
| 4446 | + |
---|
| 4447 | + if (!already_opened) |
---|
| 4448 | + percpu_counter_inc(&mdsc->metric.opened_inodes); |
---|
| 4449 | + spin_unlock(&ci->i_ceph_lock); |
---|
4062 | 4450 | } |
---|
4063 | 4451 | |
---|
4064 | 4452 | /* |
---|
.. | .. |
---|
4066 | 4454 | * we may need to release capabilities to the MDS (or schedule |
---|
4067 | 4455 | * their delayed release). |
---|
4068 | 4456 | */ |
---|
4069 | | -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) |
---|
| 4457 | +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) |
---|
4070 | 4458 | { |
---|
4071 | | - int i, last = 0; |
---|
| 4459 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); |
---|
4072 | 4460 | int bits = (fmode << 1) | 1; |
---|
| 4461 | + bool is_closed = true; |
---|
| 4462 | + int i; |
---|
| 4463 | + |
---|
| 4464 | + if (count == 1) |
---|
| 4465 | + atomic64_dec(&mdsc->metric.opened_files); |
---|
| 4466 | + |
---|
4073 | 4467 | spin_lock(&ci->i_ceph_lock); |
---|
4074 | 4468 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
---|
4075 | 4469 | if (bits & (1 << i)) { |
---|
4076 | | - BUG_ON(ci->i_nr_by_mode[i] == 0); |
---|
4077 | | - if (--ci->i_nr_by_mode[i] == 0) |
---|
4078 | | - last++; |
---|
| 4470 | + BUG_ON(ci->i_nr_by_mode[i] < count); |
---|
| 4471 | + ci->i_nr_by_mode[i] -= count; |
---|
4079 | 4472 | } |
---|
4080 | | - } |
---|
4081 | | - dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", |
---|
4082 | | - &ci->vfs_inode, fmode, |
---|
4083 | | - ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], |
---|
4084 | | - ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); |
---|
4085 | | - spin_unlock(&ci->i_ceph_lock); |
---|
4086 | 4473 | |
---|
4087 | | - if (last && ci->i_vino.snap == CEPH_NOSNAP) |
---|
4088 | | - ceph_check_caps(ci, 0, NULL); |
---|
| 4474 | + /* |
---|
| 4475 | + * If any of the mode refs is not 0 after being |
---|
| 4476 | + * decreased, that means it is still opened |
---|
| 4477 | + * by others. Just skip checking the PIN ref. |
---|
| 4478 | + */ |
---|
| 4479 | + if (i && ci->i_nr_by_mode[i]) |
---|
| 4480 | + is_closed = false; |
---|
| 4481 | + } |
---|
| 4482 | + |
---|
| 4483 | + if (is_closed) |
---|
| 4484 | + percpu_counter_dec(&mdsc->metric.opened_inodes); |
---|
| 4485 | + spin_unlock(&ci->i_ceph_lock); |
---|
4089 | 4486 | } |
---|
4090 | 4487 | |
---|
4091 | 4488 | /* |
---|
4092 | | - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it |
---|
| 4489 | + * For a soon-to-be unlinked file, drop the LINK caps. If it |
---|
4093 | 4490 | * looks like the link count will hit 0, drop any other caps (other |
---|
4094 | 4491 | * than PIN) we don't specifically want (due to the file still being |
---|
4095 | 4492 | * open). |
---|
.. | .. |
---|
4103 | 4500 | if (inode->i_nlink == 1) { |
---|
4104 | 4501 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); |
---|
4105 | 4502 | |
---|
4106 | | - ci->i_ceph_flags |= CEPH_I_NODELAY; |
---|
4107 | 4503 | if (__ceph_caps_dirty(ci)) { |
---|
4108 | 4504 | struct ceph_mds_client *mdsc = |
---|
4109 | 4505 | ceph_inode_to_client(inode)->mdsc; |
---|
.. | .. |
---|
4159 | 4555 | if (force || (cap->issued & drop)) { |
---|
4160 | 4556 | if (cap->issued & drop) { |
---|
4161 | 4557 | int wanted = __ceph_caps_wanted(ci); |
---|
4162 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) |
---|
4163 | | - wanted |= cap->mds_wanted; |
---|
4164 | 4558 | dout("encode_inode_release %p cap %p " |
---|
4165 | 4559 | "%s -> %s, wanted %s -> %s\n", inode, cap, |
---|
4166 | 4560 | ceph_cap_string(cap->issued), |
---|
.. | .. |
---|
4171 | 4565 | cap->issued &= ~drop; |
---|
4172 | 4566 | cap->implemented &= ~drop; |
---|
4173 | 4567 | cap->mds_wanted = wanted; |
---|
| 4568 | + if (cap == ci->i_auth_cap && |
---|
| 4569 | + !(wanted & CEPH_CAP_ANY_FILE_WR)) |
---|
| 4570 | + ci->i_requested_max_size = 0; |
---|
4174 | 4571 | } else { |
---|
4175 | 4572 | dout("encode_inode_release %p cap %p %s" |
---|
4176 | 4573 | " (force)\n", inode, cap, |
---|