| .. | .. |
|---|
| 8 | 8 | #include <linux/vmalloc.h> |
|---|
| 9 | 9 | #include <linux/wait.h> |
|---|
| 10 | 10 | #include <linux/writeback.h> |
|---|
| 11 | +#include <linux/iversion.h> |
|---|
| 11 | 12 | |
|---|
| 12 | 13 | #include "super.h" |
|---|
| 13 | 14 | #include "mds_client.h" |
|---|
| .. | .. |
|---|
| 148 | 149 | spin_unlock(&mdsc->caps_list_lock); |
|---|
| 149 | 150 | } |
|---|
| 150 | 151 | |
|---|
| 151 | | -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) |
|---|
| 152 | +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, |
|---|
| 153 | + struct ceph_mount_options *fsopt) |
|---|
| 152 | 154 | { |
|---|
| 153 | 155 | spin_lock(&mdsc->caps_list_lock); |
|---|
| 154 | | - mdsc->caps_min_count += delta; |
|---|
| 155 | | - BUG_ON(mdsc->caps_min_count < 0); |
|---|
| 156 | + mdsc->caps_min_count = fsopt->max_readdir; |
|---|
| 157 | + if (mdsc->caps_min_count < 1024) |
|---|
| 158 | + mdsc->caps_min_count = 1024; |
|---|
| 159 | + mdsc->caps_use_max = fsopt->caps_max; |
|---|
| 160 | + if (mdsc->caps_use_max > 0 && |
|---|
| 161 | + mdsc->caps_use_max < mdsc->caps_min_count) |
|---|
| 162 | + mdsc->caps_use_max = mdsc->caps_min_count; |
|---|
| 156 | 163 | spin_unlock(&mdsc->caps_list_lock); |
|---|
| 157 | 164 | } |
|---|
| 158 | 165 | |
|---|
| .. | .. |
|---|
| 272 | 279 | if (!err) { |
|---|
| 273 | 280 | BUG_ON(have + alloc != need); |
|---|
| 274 | 281 | ctx->count = need; |
|---|
| 282 | + ctx->used = 0; |
|---|
| 275 | 283 | } |
|---|
| 276 | 284 | |
|---|
| 277 | 285 | spin_lock(&mdsc->caps_list_lock); |
|---|
| .. | .. |
|---|
| 295 | 303 | } |
|---|
| 296 | 304 | |
|---|
| 297 | 305 | void ceph_unreserve_caps(struct ceph_mds_client *mdsc, |
|---|
| 298 | | - struct ceph_cap_reservation *ctx) |
|---|
| 306 | + struct ceph_cap_reservation *ctx) |
|---|
| 299 | 307 | { |
|---|
| 308 | + bool reclaim = false; |
|---|
| 309 | + if (!ctx->count) |
|---|
| 310 | + return; |
|---|
| 311 | + |
|---|
| 300 | 312 | dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); |
|---|
| 301 | 313 | spin_lock(&mdsc->caps_list_lock); |
|---|
| 302 | 314 | __ceph_unreserve_caps(mdsc, ctx->count); |
|---|
| 303 | 315 | ctx->count = 0; |
|---|
| 316 | + |
|---|
| 317 | + if (mdsc->caps_use_max > 0 && |
|---|
| 318 | + mdsc->caps_use_count > mdsc->caps_use_max) |
|---|
| 319 | + reclaim = true; |
|---|
| 304 | 320 | spin_unlock(&mdsc->caps_list_lock); |
|---|
| 321 | + |
|---|
| 322 | + if (reclaim) |
|---|
| 323 | + ceph_reclaim_caps_nr(mdsc, ctx->used); |
|---|
| 305 | 324 | } |
|---|
| 306 | 325 | |
|---|
| 307 | 326 | struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, |
|---|
| .. | .. |
|---|
| 346 | 365 | BUG_ON(list_empty(&mdsc->caps_list)); |
|---|
| 347 | 366 | |
|---|
| 348 | 367 | ctx->count--; |
|---|
| 368 | + ctx->used++; |
|---|
| 349 | 369 | mdsc->caps_reserve_count--; |
|---|
| 350 | 370 | mdsc->caps_use_count++; |
|---|
| 351 | 371 | |
|---|
| .. | .. |
|---|
| 438 | 458 | } |
|---|
| 439 | 459 | |
|---|
| 440 | 460 | /* |
|---|
| 441 | | - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. |
|---|
| 442 | | - */ |
|---|
| 443 | | -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) |
|---|
| 444 | | -{ |
|---|
| 445 | | - struct ceph_cap *cap; |
|---|
| 446 | | - int mds = -1; |
|---|
| 447 | | - struct rb_node *p; |
|---|
| 448 | | - |
|---|
| 449 | | - /* prefer mds with WR|BUFFER|EXCL caps */ |
|---|
| 450 | | - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
|---|
| 451 | | - cap = rb_entry(p, struct ceph_cap, ci_node); |
|---|
| 452 | | - mds = cap->mds; |
|---|
| 453 | | - if (cap->issued & (CEPH_CAP_FILE_WR | |
|---|
| 454 | | - CEPH_CAP_FILE_BUFFER | |
|---|
| 455 | | - CEPH_CAP_FILE_EXCL)) |
|---|
| 456 | | - break; |
|---|
| 457 | | - } |
|---|
| 458 | | - return mds; |
|---|
| 459 | | -} |
|---|
| 460 | | - |
|---|
| 461 | | -int ceph_get_cap_mds(struct inode *inode) |
|---|
| 462 | | -{ |
|---|
| 463 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 464 | | - int mds; |
|---|
| 465 | | - spin_lock(&ci->i_ceph_lock); |
|---|
| 466 | | - mds = __ceph_get_cap_mds(ceph_inode(inode)); |
|---|
| 467 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 468 | | - return mds; |
|---|
| 469 | | -} |
|---|
| 470 | | - |
|---|
| 471 | | -/* |
|---|
| 472 | 461 | * Called under i_ceph_lock. |
|---|
| 473 | 462 | */ |
|---|
| 474 | 463 | static void __insert_cap_node(struct ceph_inode_info *ci, |
|---|
| .. | .. |
|---|
| 500 | 489 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, |
|---|
| 501 | 490 | struct ceph_inode_info *ci) |
|---|
| 502 | 491 | { |
|---|
| 503 | | - struct ceph_mount_options *ma = mdsc->fsc->mount_options; |
|---|
| 504 | | - |
|---|
| 505 | | - ci->i_hold_caps_min = round_jiffies(jiffies + |
|---|
| 506 | | - ma->caps_wanted_delay_min * HZ); |
|---|
| 492 | + struct ceph_mount_options *opt = mdsc->fsc->mount_options; |
|---|
| 507 | 493 | ci->i_hold_caps_max = round_jiffies(jiffies + |
|---|
| 508 | | - ma->caps_wanted_delay_max * HZ); |
|---|
| 509 | | - dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, |
|---|
| 510 | | - ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); |
|---|
| 494 | + opt->caps_wanted_delay_max * HZ); |
|---|
| 495 | + dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, |
|---|
| 496 | + ci->i_hold_caps_max - jiffies); |
|---|
| 511 | 497 | } |
|---|
| 512 | 498 | |
|---|
| 513 | 499 | /* |
|---|
| .. | .. |
|---|
| 521 | 507 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
|---|
| 522 | 508 | struct ceph_inode_info *ci) |
|---|
| 523 | 509 | { |
|---|
| 524 | | - __cap_set_timeouts(mdsc, ci); |
|---|
| 525 | | - dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, |
|---|
| 510 | + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, |
|---|
| 526 | 511 | ci->i_ceph_flags, ci->i_hold_caps_max); |
|---|
| 527 | 512 | if (!mdsc->stopping) { |
|---|
| 528 | 513 | spin_lock(&mdsc->cap_delay_lock); |
|---|
| .. | .. |
|---|
| 531 | 516 | goto no_change; |
|---|
| 532 | 517 | list_del_init(&ci->i_cap_delay_list); |
|---|
| 533 | 518 | } |
|---|
| 519 | + __cap_set_timeouts(mdsc, ci); |
|---|
| 534 | 520 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); |
|---|
| 535 | 521 | no_change: |
|---|
| 536 | 522 | spin_unlock(&mdsc->cap_delay_lock); |
|---|
| .. | .. |
|---|
| 570 | 556 | spin_unlock(&mdsc->cap_delay_lock); |
|---|
| 571 | 557 | } |
|---|
| 572 | 558 | |
|---|
| 573 | | -/* |
|---|
| 574 | | - * Common issue checks for add_cap, handle_cap_grant. |
|---|
| 575 | | - */ |
|---|
| 559 | +/* Common issue checks for add_cap, handle_cap_grant. */ |
|---|
| 576 | 560 | static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, |
|---|
| 577 | 561 | unsigned issued) |
|---|
| 578 | 562 | { |
|---|
| 579 | 563 | unsigned had = __ceph_caps_issued(ci, NULL); |
|---|
| 580 | 564 | |
|---|
| 565 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 566 | + |
|---|
| 581 | 567 | /* |
|---|
| 582 | 568 | * Each time we receive FILE_CACHE anew, we increment |
|---|
| 583 | 569 | * i_rdcache_gen. |
|---|
| 584 | 570 | */ |
|---|
| 585 | | - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
|---|
| 571 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
|---|
| 572 | + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
|---|
| 586 | 573 | (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { |
|---|
| 587 | 574 | ci->i_rdcache_gen++; |
|---|
| 588 | 575 | } |
|---|
| .. | .. |
|---|
| 601 | 588 | __ceph_dir_clear_complete(ci); |
|---|
| 602 | 589 | } |
|---|
| 603 | 590 | } |
|---|
| 591 | + |
|---|
| 592 | + /* Wipe saved layout if we're losing DIR_CREATE caps */ |
|---|
| 593 | + if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && |
|---|
| 594 | + !(issued & CEPH_CAP_DIR_CREATE)) { |
|---|
| 595 | + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); |
|---|
| 596 | + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); |
|---|
| 597 | + } |
|---|
| 598 | +} |
|---|
| 599 | + |
|---|
| 600 | +/** |
|---|
| 601 | + * change_auth_cap_ses - move inode to appropriate lists when auth caps change |
|---|
| 602 | + * @ci: inode to be moved |
|---|
| 603 | + * @session: new auth caps session |
|---|
| 604 | + */ |
|---|
| 605 | +static void change_auth_cap_ses(struct ceph_inode_info *ci, |
|---|
| 606 | + struct ceph_mds_session *session) |
|---|
| 607 | +{ |
|---|
| 608 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 609 | + |
|---|
| 610 | + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) |
|---|
| 611 | + return; |
|---|
| 612 | + |
|---|
| 613 | + spin_lock(&session->s_mdsc->cap_dirty_lock); |
|---|
| 614 | + if (!list_empty(&ci->i_dirty_item)) |
|---|
| 615 | + list_move(&ci->i_dirty_item, &session->s_cap_dirty); |
|---|
| 616 | + if (!list_empty(&ci->i_flushing_item)) |
|---|
| 617 | + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
|---|
| 618 | + spin_unlock(&session->s_mdsc->cap_dirty_lock); |
|---|
| 604 | 619 | } |
|---|
| 605 | 620 | |
|---|
| 606 | 621 | /* |
|---|
| 607 | 622 | * Add a capability under the given MDS session. |
|---|
| 608 | 623 | * |
|---|
| 609 | | - * Caller should hold session snap_rwsem (read) and s_mutex. |
|---|
| 624 | + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock |
|---|
| 610 | 625 | * |
|---|
| 611 | 626 | * @fmode is the open file mode, if we are opening a file, otherwise |
|---|
| 612 | 627 | * it is < 0. (This is so we can atomically add the cap and add an |
|---|
| .. | .. |
|---|
| 614 | 629 | */ |
|---|
| 615 | 630 | void ceph_add_cap(struct inode *inode, |
|---|
| 616 | 631 | struct ceph_mds_session *session, u64 cap_id, |
|---|
| 617 | | - int fmode, unsigned issued, unsigned wanted, |
|---|
| 632 | + unsigned issued, unsigned wanted, |
|---|
| 618 | 633 | unsigned seq, unsigned mseq, u64 realmino, int flags, |
|---|
| 619 | 634 | struct ceph_cap **new_cap) |
|---|
| 620 | 635 | { |
|---|
| .. | .. |
|---|
| 623 | 638 | struct ceph_cap *cap; |
|---|
| 624 | 639 | int mds = session->s_mds; |
|---|
| 625 | 640 | int actual_wanted; |
|---|
| 641 | + u32 gen; |
|---|
| 642 | + |
|---|
| 643 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 626 | 644 | |
|---|
| 627 | 645 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, |
|---|
| 628 | 646 | session->s_mds, cap_id, ceph_cap_string(issued), seq); |
|---|
| 629 | 647 | |
|---|
| 630 | | - /* |
|---|
| 631 | | - * If we are opening the file, include file mode wanted bits |
|---|
| 632 | | - * in wanted. |
|---|
| 633 | | - */ |
|---|
| 634 | | - if (fmode >= 0) |
|---|
| 635 | | - wanted |= ceph_caps_for_mode(fmode); |
|---|
| 648 | + spin_lock(&session->s_gen_ttl_lock); |
|---|
| 649 | + gen = session->s_cap_gen; |
|---|
| 650 | + spin_unlock(&session->s_gen_ttl_lock); |
|---|
| 636 | 651 | |
|---|
| 637 | 652 | cap = __get_cap_for_mds(ci, mds); |
|---|
| 638 | 653 | if (!cap) { |
|---|
| .. | .. |
|---|
| 653 | 668 | spin_lock(&session->s_cap_lock); |
|---|
| 654 | 669 | list_add_tail(&cap->session_caps, &session->s_caps); |
|---|
| 655 | 670 | session->s_nr_caps++; |
|---|
| 671 | + atomic64_inc(&mdsc->metric.total_caps); |
|---|
| 656 | 672 | spin_unlock(&session->s_cap_lock); |
|---|
| 657 | 673 | } else { |
|---|
| 674 | + spin_lock(&session->s_cap_lock); |
|---|
| 675 | + list_move_tail(&cap->session_caps, &session->s_caps); |
|---|
| 676 | + spin_unlock(&session->s_cap_lock); |
|---|
| 677 | + |
|---|
| 678 | + if (cap->cap_gen < gen) |
|---|
| 679 | + cap->issued = cap->implemented = CEPH_CAP_PIN; |
|---|
| 680 | + |
|---|
| 658 | 681 | /* |
|---|
| 659 | 682 | * auth mds of the inode changed. we received the cap export |
|---|
| 660 | 683 | * message, but still haven't received the cap import message. |
|---|
| .. | .. |
|---|
| 726 | 749 | if (flags & CEPH_CAP_FLAG_AUTH) { |
|---|
| 727 | 750 | if (!ci->i_auth_cap || |
|---|
| 728 | 751 | ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { |
|---|
| 752 | + if (ci->i_auth_cap && |
|---|
| 753 | + ci->i_auth_cap->session != cap->session) |
|---|
| 754 | + change_auth_cap_ses(ci, cap->session); |
|---|
| 729 | 755 | ci->i_auth_cap = cap; |
|---|
| 730 | 756 | cap->mds_wanted = wanted; |
|---|
| 731 | 757 | } |
|---|
| .. | .. |
|---|
| 746 | 772 | cap->seq = seq; |
|---|
| 747 | 773 | cap->issue_seq = seq; |
|---|
| 748 | 774 | cap->mseq = mseq; |
|---|
| 749 | | - cap->cap_gen = session->s_cap_gen; |
|---|
| 750 | | - |
|---|
| 751 | | - if (fmode >= 0) |
|---|
| 752 | | - __ceph_get_fmode(ci, fmode); |
|---|
| 775 | + cap->cap_gen = gen; |
|---|
| 753 | 776 | } |
|---|
| 754 | 777 | |
|---|
| 755 | 778 | /* |
|---|
| .. | .. |
|---|
| 864 | 887 | int have = ci->i_snap_caps; |
|---|
| 865 | 888 | |
|---|
| 866 | 889 | if ((have & mask) == mask) { |
|---|
| 867 | | - dout("__ceph_caps_issued_mask %p snap issued %s" |
|---|
| 868 | | - " (mask %s)\n", &ci->vfs_inode, |
|---|
| 890 | + dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" |
|---|
| 891 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), |
|---|
| 869 | 892 | ceph_cap_string(have), |
|---|
| 870 | 893 | ceph_cap_string(mask)); |
|---|
| 871 | 894 | return 1; |
|---|
| .. | .. |
|---|
| 876 | 899 | if (!__cap_is_valid(cap)) |
|---|
| 877 | 900 | continue; |
|---|
| 878 | 901 | if ((cap->issued & mask) == mask) { |
|---|
| 879 | | - dout("__ceph_caps_issued_mask %p cap %p issued %s" |
|---|
| 880 | | - " (mask %s)\n", &ci->vfs_inode, cap, |
|---|
| 902 | + dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" |
|---|
| 903 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, |
|---|
| 881 | 904 | ceph_cap_string(cap->issued), |
|---|
| 882 | 905 | ceph_cap_string(mask)); |
|---|
| 883 | 906 | if (touch) |
|---|
| .. | .. |
|---|
| 888 | 911 | /* does a combination of caps satisfy mask? */ |
|---|
| 889 | 912 | have |= cap->issued; |
|---|
| 890 | 913 | if ((have & mask) == mask) { |
|---|
| 891 | | - dout("__ceph_caps_issued_mask %p combo issued %s" |
|---|
| 892 | | - " (mask %s)\n", &ci->vfs_inode, |
|---|
| 914 | + dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" |
|---|
| 915 | + " (mask %s)\n", ceph_ino(&ci->vfs_inode), |
|---|
| 893 | 916 | ceph_cap_string(cap->issued), |
|---|
| 894 | 917 | ceph_cap_string(mask)); |
|---|
| 895 | 918 | if (touch) { |
|---|
| .. | .. |
|---|
| 903 | 926 | ci_node); |
|---|
| 904 | 927 | if (!__cap_is_valid(cap)) |
|---|
| 905 | 928 | continue; |
|---|
| 906 | | - __touch_cap(cap); |
|---|
| 929 | + if (cap->issued & mask) |
|---|
| 930 | + __touch_cap(cap); |
|---|
| 907 | 931 | } |
|---|
| 908 | 932 | } |
|---|
| 909 | 933 | return 1; |
|---|
| .. | .. |
|---|
| 911 | 935 | } |
|---|
| 912 | 936 | |
|---|
| 913 | 937 | return 0; |
|---|
| 938 | +} |
|---|
| 939 | + |
|---|
| 940 | +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, |
|---|
| 941 | + int touch) |
|---|
| 942 | +{ |
|---|
| 943 | + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); |
|---|
| 944 | + int r; |
|---|
| 945 | + |
|---|
| 946 | + r = __ceph_caps_issued_mask(ci, mask, touch); |
|---|
| 947 | + if (r) |
|---|
| 948 | + ceph_update_cap_hit(&fsc->mdsc->metric); |
|---|
| 949 | + else |
|---|
| 950 | + ceph_update_cap_mis(&fsc->mdsc->metric); |
|---|
| 951 | + return r; |
|---|
| 914 | 952 | } |
|---|
| 915 | 953 | |
|---|
| 916 | 954 | /* |
|---|
| .. | .. |
|---|
| 952 | 990 | if (ci->i_rd_ref) |
|---|
| 953 | 991 | used |= CEPH_CAP_FILE_RD; |
|---|
| 954 | 992 | if (ci->i_rdcache_ref || |
|---|
| 955 | | - (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ |
|---|
| 993 | + (S_ISREG(ci->vfs_inode.i_mode) && |
|---|
| 956 | 994 | ci->vfs_inode.i_data.nrpages)) |
|---|
| 957 | 995 | used |= CEPH_CAP_FILE_CACHE; |
|---|
| 958 | 996 | if (ci->i_wr_ref) |
|---|
| 959 | 997 | used |= CEPH_CAP_FILE_WR; |
|---|
| 960 | 998 | if (ci->i_wb_ref || ci->i_wrbuffer_ref) |
|---|
| 961 | 999 | used |= CEPH_CAP_FILE_BUFFER; |
|---|
| 1000 | + if (ci->i_fx_ref) |
|---|
| 1001 | + used |= CEPH_CAP_FILE_EXCL; |
|---|
| 962 | 1002 | return used; |
|---|
| 963 | 1003 | } |
|---|
| 1004 | + |
|---|
| 1005 | +#define FMODE_WAIT_BIAS 1000 |
|---|
| 964 | 1006 | |
|---|
| 965 | 1007 | /* |
|---|
| 966 | 1008 | * wanted, by virtue of open file modes |
|---|
| 967 | 1009 | */ |
|---|
| 968 | 1010 | int __ceph_caps_file_wanted(struct ceph_inode_info *ci) |
|---|
| 969 | 1011 | { |
|---|
| 970 | | - int i, bits = 0; |
|---|
| 971 | | - for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
|---|
| 972 | | - if (ci->i_nr_by_mode[i]) |
|---|
| 973 | | - bits |= 1 << i; |
|---|
| 1012 | + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); |
|---|
| 1013 | + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); |
|---|
| 1014 | + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); |
|---|
| 1015 | + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); |
|---|
| 1016 | + struct ceph_mount_options *opt = |
|---|
| 1017 | + ceph_inode_to_client(&ci->vfs_inode)->mount_options; |
|---|
| 1018 | + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; |
|---|
| 1019 | + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; |
|---|
| 1020 | + |
|---|
| 1021 | + if (S_ISDIR(ci->vfs_inode.i_mode)) { |
|---|
| 1022 | + int want = 0; |
|---|
| 1023 | + |
|---|
| 1024 | + /* use used_cutoff here, to keep dir's wanted caps longer */ |
|---|
| 1025 | + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || |
|---|
| 1026 | + time_after(ci->i_last_rd, used_cutoff)) |
|---|
| 1027 | + want |= CEPH_CAP_ANY_SHARED; |
|---|
| 1028 | + |
|---|
| 1029 | + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || |
|---|
| 1030 | + time_after(ci->i_last_wr, used_cutoff)) { |
|---|
| 1031 | + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
|---|
| 1032 | + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) |
|---|
| 1033 | + want |= CEPH_CAP_ANY_DIR_OPS; |
|---|
| 1034 | + } |
|---|
| 1035 | + |
|---|
| 1036 | + if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) |
|---|
| 1037 | + want |= CEPH_CAP_PIN; |
|---|
| 1038 | + |
|---|
| 1039 | + return want; |
|---|
| 1040 | + } else { |
|---|
| 1041 | + int bits = 0; |
|---|
| 1042 | + |
|---|
| 1043 | + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { |
|---|
| 1044 | + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || |
|---|
| 1045 | + time_after(ci->i_last_rd, used_cutoff)) |
|---|
| 1046 | + bits |= 1 << RD_SHIFT; |
|---|
| 1047 | + } else if (time_after(ci->i_last_rd, idle_cutoff)) { |
|---|
| 1048 | + bits |= 1 << RD_SHIFT; |
|---|
| 1049 | + } |
|---|
| 1050 | + |
|---|
| 1051 | + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { |
|---|
| 1052 | + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || |
|---|
| 1053 | + time_after(ci->i_last_wr, used_cutoff)) |
|---|
| 1054 | + bits |= 1 << WR_SHIFT; |
|---|
| 1055 | + } else if (time_after(ci->i_last_wr, idle_cutoff)) { |
|---|
| 1056 | + bits |= 1 << WR_SHIFT; |
|---|
| 1057 | + } |
|---|
| 1058 | + |
|---|
| 1059 | + /* check lazyio only when read/write is wanted */ |
|---|
| 1060 | + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && |
|---|
| 1061 | + ci->i_nr_by_mode[LAZY_SHIFT] > 0) |
|---|
| 1062 | + bits |= 1 << LAZY_SHIFT; |
|---|
| 1063 | + |
|---|
| 1064 | + return bits ? ceph_caps_for_mode(bits >> 1) : 0; |
|---|
| 974 | 1065 | } |
|---|
| 975 | | - if (bits == 0) |
|---|
| 976 | | - return 0; |
|---|
| 977 | | - return ceph_caps_for_mode(bits >> 1); |
|---|
| 1066 | +} |
|---|
| 1067 | + |
|---|
| 1068 | +/* |
|---|
| 1069 | + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) |
|---|
| 1070 | + */ |
|---|
| 1071 | +int __ceph_caps_wanted(struct ceph_inode_info *ci) |
|---|
| 1072 | +{ |
|---|
| 1073 | + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); |
|---|
| 1074 | + if (S_ISDIR(ci->vfs_inode.i_mode)) { |
|---|
| 1075 | + /* we want EXCL if holding caps of dir ops */ |
|---|
| 1076 | + if (w & CEPH_CAP_ANY_DIR_OPS) |
|---|
| 1077 | + w |= CEPH_CAP_FILE_EXCL; |
|---|
| 1078 | + } else { |
|---|
| 1079 | + /* we want EXCL if dirty data */ |
|---|
| 1080 | + if (w & CEPH_CAP_FILE_BUFFER) |
|---|
| 1081 | + w |= CEPH_CAP_FILE_EXCL; |
|---|
| 1082 | + } |
|---|
| 1083 | + return w; |
|---|
| 978 | 1084 | } |
|---|
| 979 | 1085 | |
|---|
| 980 | 1086 | /* |
|---|
| .. | .. |
|---|
| 998 | 1104 | return mds_wanted; |
|---|
| 999 | 1105 | } |
|---|
| 1000 | 1106 | |
|---|
| 1001 | | -/* |
|---|
| 1002 | | - * called under i_ceph_lock |
|---|
| 1003 | | - */ |
|---|
| 1004 | | -static int __ceph_is_single_caps(struct ceph_inode_info *ci) |
|---|
| 1005 | | -{ |
|---|
| 1006 | | - return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); |
|---|
| 1007 | | -} |
|---|
| 1008 | | - |
|---|
| 1009 | | -static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
|---|
| 1010 | | -{ |
|---|
| 1011 | | - return !RB_EMPTY_ROOT(&ci->i_caps); |
|---|
| 1012 | | -} |
|---|
| 1013 | | - |
|---|
| 1014 | 1107 | int ceph_is_any_caps(struct inode *inode) |
|---|
| 1015 | 1108 | { |
|---|
| 1016 | 1109 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 1017 | 1110 | int ret; |
|---|
| 1018 | 1111 | |
|---|
| 1019 | 1112 | spin_lock(&ci->i_ceph_lock); |
|---|
| 1020 | | - ret = __ceph_is_any_caps(ci); |
|---|
| 1113 | + ret = __ceph_is_any_real_caps(ci); |
|---|
| 1021 | 1114 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 1022 | 1115 | |
|---|
| 1023 | 1116 | return ret; |
|---|
| .. | .. |
|---|
| 1062 | 1155 | |
|---|
| 1063 | 1156 | /* remove from inode's cap rbtree, and clear auth cap */ |
|---|
| 1064 | 1157 | rb_erase(&cap->ci_node, &ci->i_caps); |
|---|
| 1065 | | - if (ci->i_auth_cap == cap) |
|---|
| 1158 | + if (ci->i_auth_cap == cap) { |
|---|
| 1159 | + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); |
|---|
| 1066 | 1160 | ci->i_auth_cap = NULL; |
|---|
| 1161 | + } |
|---|
| 1067 | 1162 | |
|---|
| 1068 | 1163 | /* remove from session list */ |
|---|
| 1069 | 1164 | spin_lock(&session->s_cap_lock); |
|---|
| .. | .. |
|---|
| 1074 | 1169 | } else { |
|---|
| 1075 | 1170 | list_del_init(&cap->session_caps); |
|---|
| 1076 | 1171 | session->s_nr_caps--; |
|---|
| 1172 | + atomic64_dec(&mdsc->metric.total_caps); |
|---|
| 1077 | 1173 | cap->session = NULL; |
|---|
| 1078 | 1174 | removed = 1; |
|---|
| 1079 | 1175 | } |
|---|
| .. | .. |
|---|
| 1088 | 1184 | (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { |
|---|
| 1089 | 1185 | cap->queue_release = 1; |
|---|
| 1090 | 1186 | if (removed) { |
|---|
| 1091 | | - list_add_tail(&cap->session_caps, |
|---|
| 1092 | | - &session->s_cap_releases); |
|---|
| 1093 | | - session->s_num_cap_releases++; |
|---|
| 1187 | + __ceph_queue_cap_release(session, cap); |
|---|
| 1094 | 1188 | removed = 0; |
|---|
| 1095 | 1189 | } |
|---|
| 1096 | 1190 | } else { |
|---|
| .. | .. |
|---|
| 1103 | 1197 | if (removed) |
|---|
| 1104 | 1198 | ceph_put_cap(mdsc, cap); |
|---|
| 1105 | 1199 | |
|---|
| 1106 | | - /* when reconnect denied, we remove session caps forcibly, |
|---|
| 1107 | | - * i_wr_ref can be non-zero. If there are ongoing write, |
|---|
| 1108 | | - * keep i_snap_realm. |
|---|
| 1109 | | - */ |
|---|
| 1110 | | - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) |
|---|
| 1111 | | - drop_inode_snap_realm(ci); |
|---|
| 1200 | + if (!__ceph_is_any_real_caps(ci)) { |
|---|
| 1201 | + /* when reconnect denied, we remove session caps forcibly, |
|---|
| 1202 | + * i_wr_ref can be non-zero. If there are ongoing write, |
|---|
| 1203 | + * keep i_snap_realm. |
|---|
| 1204 | + */ |
|---|
| 1205 | + if (ci->i_wr_ref == 0 && ci->i_snap_realm) |
|---|
| 1206 | + drop_inode_snap_realm(ci); |
|---|
| 1112 | 1207 | |
|---|
| 1113 | | - if (!__ceph_is_any_real_caps(ci)) |
|---|
| 1114 | 1208 | __cap_delay_cancel(mdsc, ci); |
|---|
| 1209 | + } |
|---|
| 1115 | 1210 | } |
|---|
| 1116 | 1211 | |
|---|
| 1117 | 1212 | struct cap_msg_args { |
|---|
| .. | .. |
|---|
| 1119 | 1214 | u64 ino, cid, follows; |
|---|
| 1120 | 1215 | u64 flush_tid, oldest_flush_tid, size, max_size; |
|---|
| 1121 | 1216 | u64 xattr_version; |
|---|
| 1217 | + u64 change_attr; |
|---|
| 1122 | 1218 | struct ceph_buffer *xattr_buf; |
|---|
| 1123 | | - struct timespec64 atime, mtime, ctime; |
|---|
| 1219 | + struct ceph_buffer *old_xattr_buf; |
|---|
| 1220 | + struct timespec64 atime, mtime, ctime, btime; |
|---|
| 1124 | 1221 | int op, caps, wanted, dirty; |
|---|
| 1125 | 1222 | u32 seq, issue_seq, mseq, time_warp_seq; |
|---|
| 1126 | 1223 | u32 flags; |
|---|
| .. | .. |
|---|
| 1128 | 1225 | kgid_t gid; |
|---|
| 1129 | 1226 | umode_t mode; |
|---|
| 1130 | 1227 | bool inline_data; |
|---|
| 1228 | + bool wake; |
|---|
| 1131 | 1229 | }; |
|---|
| 1132 | 1230 | |
|---|
| 1133 | 1231 | /* |
|---|
| 1134 | | - * Build and send a cap message to the given MDS. |
|---|
| 1135 | | - * |
|---|
| 1136 | | - * Caller should be holding s_mutex. |
|---|
| 1232 | + * cap struct size + flock buffer size + inline version + inline data size + |
|---|
| 1233 | + * osd_epoch_barrier + oldest_flush_tid |
|---|
| 1137 | 1234 | */ |
|---|
| 1138 | | -static int send_cap_msg(struct cap_msg_args *arg) |
|---|
| 1235 | +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ |
|---|
| 1236 | + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) |
|---|
| 1237 | + |
|---|
| 1238 | +/* Marshal up the cap msg to the MDS */ |
|---|
| 1239 | +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) |
|---|
| 1139 | 1240 | { |
|---|
| 1140 | 1241 | struct ceph_mds_caps *fc; |
|---|
| 1141 | | - struct ceph_msg *msg; |
|---|
| 1142 | 1242 | void *p; |
|---|
| 1143 | | - size_t extra_len; |
|---|
| 1144 | | - struct timespec64 zerotime = {0}; |
|---|
| 1145 | 1243 | struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; |
|---|
| 1146 | 1244 | |
|---|
| 1147 | | - dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" |
|---|
| 1148 | | - " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" |
|---|
| 1149 | | - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), |
|---|
| 1150 | | - arg->cid, arg->ino, ceph_cap_string(arg->caps), |
|---|
| 1151 | | - ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), |
|---|
| 1152 | | - arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, |
|---|
| 1153 | | - arg->mseq, arg->follows, arg->size, arg->max_size, |
|---|
| 1154 | | - arg->xattr_version, |
|---|
| 1245 | + dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", |
|---|
| 1246 | + __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, |
|---|
| 1247 | + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), |
|---|
| 1248 | + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, |
|---|
| 1249 | + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, |
|---|
| 1250 | + arg->size, arg->max_size, arg->xattr_version, |
|---|
| 1155 | 1251 | arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); |
|---|
| 1156 | | - |
|---|
| 1157 | | - /* flock buffer size + inline version + inline data size + |
|---|
| 1158 | | - * osd_epoch_barrier + oldest_flush_tid */ |
|---|
| 1159 | | - extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; |
|---|
| 1160 | | - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, |
|---|
| 1161 | | - GFP_NOFS, false); |
|---|
| 1162 | | - if (!msg) |
|---|
| 1163 | | - return -ENOMEM; |
|---|
| 1164 | 1252 | |
|---|
| 1165 | 1253 | msg->hdr.version = cpu_to_le16(10); |
|---|
| 1166 | 1254 | msg->hdr.tid = cpu_to_le64(arg->flush_tid); |
|---|
| .. | .. |
|---|
| 1226 | 1314 | /* pool namespace (version 8) (mds always ignores this) */ |
|---|
| 1227 | 1315 | ceph_encode_32(&p, 0); |
|---|
| 1228 | 1316 | |
|---|
| 1229 | | - /* |
|---|
| 1230 | | - * btime and change_attr (version 9) |
|---|
| 1231 | | - * |
|---|
| 1232 | | - * We just zero these out for now, as the MDS ignores them unless |
|---|
| 1233 | | - * the requisite feature flags are set (which we don't do yet). |
|---|
| 1234 | | - */ |
|---|
| 1235 | | - ceph_encode_timespec64(p, &zerotime); |
|---|
| 1317 | + /* btime and change_attr (version 9) */ |
|---|
| 1318 | + ceph_encode_timespec64(p, &arg->btime); |
|---|
| 1236 | 1319 | p += sizeof(struct ceph_timespec); |
|---|
| 1237 | | - ceph_encode_64(&p, 0); |
|---|
| 1320 | + ceph_encode_64(&p, arg->change_attr); |
|---|
| 1238 | 1321 | |
|---|
| 1239 | 1322 | /* Advisory flags (version 10) */ |
|---|
| 1240 | 1323 | ceph_encode_32(&p, arg->flags); |
|---|
| 1241 | | - |
|---|
| 1242 | | - ceph_con_send(&arg->session->s_con, msg); |
|---|
| 1243 | | - return 0; |
|---|
| 1244 | 1324 | } |
|---|
| 1245 | 1325 | |
|---|
| 1246 | 1326 | /* |
|---|
| 1247 | 1327 | * Queue cap releases when an inode is dropped from our cache. |
|---|
| 1248 | 1328 | */ |
|---|
| 1249 | | -void ceph_queue_caps_release(struct inode *inode) |
|---|
| 1329 | +void __ceph_remove_caps(struct ceph_inode_info *ci) |
|---|
| 1250 | 1330 | { |
|---|
| 1251 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 1252 | 1331 | struct rb_node *p; |
|---|
| 1253 | 1332 | |
|---|
| 1254 | 1333 | /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) |
|---|
| .. | .. |
|---|
| 1264 | 1343 | } |
|---|
| 1265 | 1344 | |
|---|
| 1266 | 1345 | /* |
|---|
| 1267 | | - * Send a cap msg on the given inode. Update our caps state, then |
|---|
| 1268 | | - * drop i_ceph_lock and send the message. |
|---|
| 1346 | + * Prepare to send a cap message to an MDS. Update the cap state, and populate |
|---|
| 1347 | + * the arg struct with the parameters that will need to be sent. This should |
|---|
| 1348 | + * be done under the i_ceph_lock to guard against changes to cap state. |
|---|
| 1269 | 1349 | * |
|---|
| 1270 | 1350 | * Make note of max_size reported/requested from mds, revoked caps |
|---|
| 1271 | 1351 | * that have now been implemented. |
|---|
| 1272 | | - * |
|---|
| 1273 | | - * Make half-hearted attempt ot to invalidate page cache if we are |
|---|
| 1274 | | - * dropping RDCACHE. Note that this will leave behind locked pages |
|---|
| 1275 | | - * that we'll then need to deal with elsewhere. |
|---|
| 1276 | | - * |
|---|
| 1277 | | - * Return non-zero if delayed release, or we experienced an error |
|---|
| 1278 | | - * such that the caller should requeue + retry later. |
|---|
| 1279 | | - * |
|---|
| 1280 | | - * called with i_ceph_lock, then drops it. |
|---|
| 1281 | | - * caller should hold snap_rwsem (read), s_mutex. |
|---|
| 1282 | 1352 | */ |
|---|
| 1283 | | -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, |
|---|
| 1284 | | - int op, bool sync, int used, int want, int retain, |
|---|
| 1285 | | - int flushing, u64 flush_tid, u64 oldest_flush_tid) |
|---|
| 1286 | | - __releases(cap->ci->i_ceph_lock) |
|---|
| 1353 | +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, |
|---|
| 1354 | + int op, int flags, int used, int want, int retain, |
|---|
| 1355 | + int flushing, u64 flush_tid, u64 oldest_flush_tid) |
|---|
| 1287 | 1356 | { |
|---|
| 1288 | 1357 | struct ceph_inode_info *ci = cap->ci; |
|---|
| 1289 | 1358 | struct inode *inode = &ci->vfs_inode; |
|---|
| 1290 | | - struct ceph_buffer *old_blob = NULL; |
|---|
| 1291 | | - struct cap_msg_args arg; |
|---|
| 1292 | 1359 | int held, revoking; |
|---|
| 1293 | | - int wake = 0; |
|---|
| 1294 | | - int delayed = 0; |
|---|
| 1295 | | - int ret; |
|---|
| 1360 | + |
|---|
| 1361 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 1296 | 1362 | |
|---|
| 1297 | 1363 | held = cap->issued | cap->implemented; |
|---|
| 1298 | 1364 | revoking = cap->implemented & ~cap->issued; |
|---|
| 1299 | 1365 | retain &= ~revoking; |
|---|
| 1300 | 1366 | |
|---|
| 1301 | | - dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", |
|---|
| 1302 | | - inode, cap, cap->session, |
|---|
| 1367 | + dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", |
|---|
| 1368 | + __func__, inode, cap, cap->session, |
|---|
| 1303 | 1369 | ceph_cap_string(held), ceph_cap_string(held & retain), |
|---|
| 1304 | 1370 | ceph_cap_string(revoking)); |
|---|
| 1305 | 1371 | BUG_ON((retain & CEPH_CAP_PIN) == 0); |
|---|
| 1306 | 1372 | |
|---|
| 1307 | | - arg.session = cap->session; |
|---|
| 1308 | | - |
|---|
| 1309 | | - /* don't release wanted unless we've waited a bit. */ |
|---|
| 1310 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && |
|---|
| 1311 | | - time_before(jiffies, ci->i_hold_caps_min)) { |
|---|
| 1312 | | - dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", |
|---|
| 1313 | | - ceph_cap_string(cap->issued), |
|---|
| 1314 | | - ceph_cap_string(cap->issued & retain), |
|---|
| 1315 | | - ceph_cap_string(cap->mds_wanted), |
|---|
| 1316 | | - ceph_cap_string(want)); |
|---|
| 1317 | | - want |= cap->mds_wanted; |
|---|
| 1318 | | - retain |= cap->issued; |
|---|
| 1319 | | - delayed = 1; |
|---|
| 1320 | | - } |
|---|
| 1321 | | - ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); |
|---|
| 1322 | | - if (want & ~cap->mds_wanted) { |
|---|
| 1323 | | - /* user space may open/close single file frequently. |
|---|
| 1324 | | - * This avoids droping mds_wanted immediately after |
|---|
| 1325 | | - * requesting new mds_wanted. |
|---|
| 1326 | | - */ |
|---|
| 1327 | | - __cap_set_timeouts(mdsc, ci); |
|---|
| 1328 | | - } |
|---|
| 1373 | + ci->i_ceph_flags &= ~CEPH_I_FLUSH; |
|---|
| 1329 | 1374 | |
|---|
| 1330 | 1375 | cap->issued &= retain; /* drop bits we don't want */ |
|---|
| 1331 | | - if (cap->implemented & ~cap->issued) { |
|---|
| 1332 | | - /* |
|---|
| 1333 | | - * Wake up any waiters on wanted -> needed transition. |
|---|
| 1334 | | - * This is due to the weird transition from buffered |
|---|
| 1335 | | - * to sync IO... we need to flush dirty pages _before_ |
|---|
| 1336 | | - * allowing sync writes to avoid reordering. |
|---|
| 1337 | | - */ |
|---|
| 1338 | | - wake = 1; |
|---|
| 1339 | | - } |
|---|
| 1376 | + /* |
|---|
| 1377 | + * Wake up any waiters on wanted -> needed transition. This is due to |
|---|
| 1378 | + * the weird transition from buffered to sync IO... we need to flush |
|---|
| 1379 | + * dirty pages _before_ allowing sync writes to avoid reordering. |
|---|
| 1380 | + */ |
|---|
| 1381 | + arg->wake = cap->implemented & ~cap->issued; |
|---|
| 1340 | 1382 | cap->implemented &= cap->issued | used; |
|---|
| 1341 | 1383 | cap->mds_wanted = want; |
|---|
| 1342 | 1384 | |
|---|
| 1343 | | - arg.ino = ceph_vino(inode).ino; |
|---|
| 1344 | | - arg.cid = cap->cap_id; |
|---|
| 1345 | | - arg.follows = flushing ? ci->i_head_snapc->seq : 0; |
|---|
| 1346 | | - arg.flush_tid = flush_tid; |
|---|
| 1347 | | - arg.oldest_flush_tid = oldest_flush_tid; |
|---|
| 1385 | + arg->session = cap->session; |
|---|
| 1386 | + arg->ino = ceph_vino(inode).ino; |
|---|
| 1387 | + arg->cid = cap->cap_id; |
|---|
| 1388 | + arg->follows = flushing ? ci->i_head_snapc->seq : 0; |
|---|
| 1389 | + arg->flush_tid = flush_tid; |
|---|
| 1390 | + arg->oldest_flush_tid = oldest_flush_tid; |
|---|
| 1348 | 1391 | |
|---|
| 1349 | | - arg.size = inode->i_size; |
|---|
| 1350 | | - ci->i_reported_size = arg.size; |
|---|
| 1351 | | - arg.max_size = ci->i_wanted_max_size; |
|---|
| 1352 | | - ci->i_requested_max_size = arg.max_size; |
|---|
| 1392 | + arg->size = inode->i_size; |
|---|
| 1393 | + ci->i_reported_size = arg->size; |
|---|
| 1394 | + arg->max_size = ci->i_wanted_max_size; |
|---|
| 1395 | + if (cap == ci->i_auth_cap) { |
|---|
| 1396 | + if (want & CEPH_CAP_ANY_FILE_WR) |
|---|
| 1397 | + ci->i_requested_max_size = arg->max_size; |
|---|
| 1398 | + else |
|---|
| 1399 | + ci->i_requested_max_size = 0; |
|---|
| 1400 | + } |
|---|
| 1353 | 1401 | |
|---|
| 1354 | 1402 | if (flushing & CEPH_CAP_XATTR_EXCL) { |
|---|
| 1355 | | - old_blob = __ceph_build_xattrs_blob(ci); |
|---|
| 1356 | | - arg.xattr_version = ci->i_xattrs.version; |
|---|
| 1357 | | - arg.xattr_buf = ci->i_xattrs.blob; |
|---|
| 1403 | + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); |
|---|
| 1404 | + arg->xattr_version = ci->i_xattrs.version; |
|---|
| 1405 | + arg->xattr_buf = ci->i_xattrs.blob; |
|---|
| 1358 | 1406 | } else { |
|---|
| 1359 | | - arg.xattr_buf = NULL; |
|---|
| 1407 | + arg->xattr_buf = NULL; |
|---|
| 1408 | + arg->old_xattr_buf = NULL; |
|---|
| 1360 | 1409 | } |
|---|
| 1361 | 1410 | |
|---|
| 1362 | | - arg.mtime = inode->i_mtime; |
|---|
| 1363 | | - arg.atime = inode->i_atime; |
|---|
| 1364 | | - arg.ctime = inode->i_ctime; |
|---|
| 1411 | + arg->mtime = inode->i_mtime; |
|---|
| 1412 | + arg->atime = inode->i_atime; |
|---|
| 1413 | + arg->ctime = inode->i_ctime; |
|---|
| 1414 | + arg->btime = ci->i_btime; |
|---|
| 1415 | + arg->change_attr = inode_peek_iversion_raw(inode); |
|---|
| 1365 | 1416 | |
|---|
| 1366 | | - arg.op = op; |
|---|
| 1367 | | - arg.caps = cap->implemented; |
|---|
| 1368 | | - arg.wanted = want; |
|---|
| 1369 | | - arg.dirty = flushing; |
|---|
| 1417 | + arg->op = op; |
|---|
| 1418 | + arg->caps = cap->implemented; |
|---|
| 1419 | + arg->wanted = want; |
|---|
| 1420 | + arg->dirty = flushing; |
|---|
| 1370 | 1421 | |
|---|
| 1371 | | - arg.seq = cap->seq; |
|---|
| 1372 | | - arg.issue_seq = cap->issue_seq; |
|---|
| 1373 | | - arg.mseq = cap->mseq; |
|---|
| 1374 | | - arg.time_warp_seq = ci->i_time_warp_seq; |
|---|
| 1422 | + arg->seq = cap->seq; |
|---|
| 1423 | + arg->issue_seq = cap->issue_seq; |
|---|
| 1424 | + arg->mseq = cap->mseq; |
|---|
| 1425 | + arg->time_warp_seq = ci->i_time_warp_seq; |
|---|
| 1375 | 1426 | |
|---|
| 1376 | | - arg.uid = inode->i_uid; |
|---|
| 1377 | | - arg.gid = inode->i_gid; |
|---|
| 1378 | | - arg.mode = inode->i_mode; |
|---|
| 1427 | + arg->uid = inode->i_uid; |
|---|
| 1428 | + arg->gid = inode->i_gid; |
|---|
| 1429 | + arg->mode = inode->i_mode; |
|---|
| 1379 | 1430 | |
|---|
| 1380 | | - arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; |
|---|
| 1381 | | - if (list_empty(&ci->i_cap_snaps)) |
|---|
| 1382 | | - arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; |
|---|
| 1383 | | - else |
|---|
| 1384 | | - arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; |
|---|
| 1385 | | - if (sync) |
|---|
| 1386 | | - arg.flags |= CEPH_CLIENT_CAPS_SYNC; |
|---|
| 1431 | + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; |
|---|
| 1432 | + if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && |
|---|
| 1433 | + !list_empty(&ci->i_cap_snaps)) { |
|---|
| 1434 | + struct ceph_cap_snap *capsnap; |
|---|
| 1435 | + list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { |
|---|
| 1436 | + if (capsnap->cap_flush.tid) |
|---|
| 1437 | + break; |
|---|
| 1438 | + if (capsnap->need_flush) { |
|---|
| 1439 | + flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; |
|---|
| 1440 | + break; |
|---|
| 1441 | + } |
|---|
| 1442 | + } |
|---|
| 1443 | + } |
|---|
| 1444 | + arg->flags = flags; |
|---|
| 1445 | +} |
|---|
| 1387 | 1446 | |
|---|
| 1388 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 1447 | +/* |
|---|
| 1448 | + * Send a cap msg on the given inode. |
|---|
| 1449 | + * |
|---|
| 1450 | + * Caller should hold snap_rwsem (read), s_mutex. |
|---|
| 1451 | + */ |
|---|
| 1452 | +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) |
|---|
| 1453 | +{ |
|---|
| 1454 | + struct ceph_msg *msg; |
|---|
| 1455 | + struct inode *inode = &ci->vfs_inode; |
|---|
| 1389 | 1456 | |
|---|
| 1390 | | - ceph_buffer_put(old_blob); |
|---|
| 1391 | | - |
|---|
| 1392 | | - ret = send_cap_msg(&arg); |
|---|
| 1393 | | - if (ret < 0) { |
|---|
| 1394 | | - dout("error sending cap msg, must requeue %p\n", inode); |
|---|
| 1395 | | - delayed = 1; |
|---|
| 1457 | + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); |
|---|
| 1458 | + if (!msg) { |
|---|
| 1459 | + pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", |
|---|
| 1460 | + ceph_vinop(inode), ceph_cap_string(arg->dirty), |
|---|
| 1461 | + arg->flush_tid); |
|---|
| 1462 | + spin_lock(&ci->i_ceph_lock); |
|---|
| 1463 | + __cap_delay_requeue(arg->session->s_mdsc, ci); |
|---|
| 1464 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 1465 | + return; |
|---|
| 1396 | 1466 | } |
|---|
| 1397 | 1467 | |
|---|
| 1398 | | - if (wake) |
|---|
| 1468 | + encode_cap_msg(msg, arg); |
|---|
| 1469 | + ceph_con_send(&arg->session->s_con, msg); |
|---|
| 1470 | + ceph_buffer_put(arg->old_xattr_buf); |
|---|
| 1471 | + if (arg->wake) |
|---|
| 1399 | 1472 | wake_up_all(&ci->i_cap_wq); |
|---|
| 1400 | | - |
|---|
| 1401 | | - return delayed; |
|---|
| 1402 | 1473 | } |
|---|
| 1403 | 1474 | |
|---|
| 1404 | 1475 | static inline int __send_flush_snap(struct inode *inode, |
|---|
| .. | .. |
|---|
| 1407 | 1478 | u32 mseq, u64 oldest_flush_tid) |
|---|
| 1408 | 1479 | { |
|---|
| 1409 | 1480 | struct cap_msg_args arg; |
|---|
| 1481 | + struct ceph_msg *msg; |
|---|
| 1482 | + |
|---|
| 1483 | + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); |
|---|
| 1484 | + if (!msg) |
|---|
| 1485 | + return -ENOMEM; |
|---|
| 1410 | 1486 | |
|---|
| 1411 | 1487 | arg.session = session; |
|---|
| 1412 | 1488 | arg.ino = ceph_vino(inode).ino; |
|---|
| .. | .. |
|---|
| 1419 | 1495 | arg.max_size = 0; |
|---|
| 1420 | 1496 | arg.xattr_version = capsnap->xattr_version; |
|---|
| 1421 | 1497 | arg.xattr_buf = capsnap->xattr_blob; |
|---|
| 1498 | + arg.old_xattr_buf = NULL; |
|---|
| 1422 | 1499 | |
|---|
| 1423 | 1500 | arg.atime = capsnap->atime; |
|---|
| 1424 | 1501 | arg.mtime = capsnap->mtime; |
|---|
| 1425 | 1502 | arg.ctime = capsnap->ctime; |
|---|
| 1503 | + arg.btime = capsnap->btime; |
|---|
| 1504 | + arg.change_attr = capsnap->change_attr; |
|---|
| 1426 | 1505 | |
|---|
| 1427 | 1506 | arg.op = CEPH_CAP_OP_FLUSHSNAP; |
|---|
| 1428 | 1507 | arg.caps = capsnap->issued; |
|---|
| .. | .. |
|---|
| 1440 | 1519 | |
|---|
| 1441 | 1520 | arg.inline_data = capsnap->inline_data; |
|---|
| 1442 | 1521 | arg.flags = 0; |
|---|
| 1522 | + arg.wake = false; |
|---|
| 1443 | 1523 | |
|---|
| 1444 | | - return send_cap_msg(&arg); |
|---|
| 1524 | + encode_cap_msg(msg, &arg); |
|---|
| 1525 | + ceph_con_send(&arg.session->s_con, msg); |
|---|
| 1526 | + return 0; |
|---|
| 1445 | 1527 | } |
|---|
| 1446 | 1528 | |
|---|
| 1447 | 1529 | /* |
|---|
| .. | .. |
|---|
| 1554 | 1636 | struct inode *inode = &ci->vfs_inode; |
|---|
| 1555 | 1637 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
|---|
| 1556 | 1638 | struct ceph_mds_session *session = NULL; |
|---|
| 1639 | + bool need_put = false; |
|---|
| 1557 | 1640 | int mds; |
|---|
| 1558 | 1641 | |
|---|
| 1559 | 1642 | dout("ceph_flush_snaps %p\n", inode); |
|---|
| .. | .. |
|---|
| 1590 | 1673 | } |
|---|
| 1591 | 1674 | |
|---|
| 1592 | 1675 | // make sure flushsnap messages are sent in proper order. |
|---|
| 1593 | | - if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
|---|
| 1676 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
|---|
| 1594 | 1677 | __kick_flushing_caps(mdsc, session, ci, 0); |
|---|
| 1595 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 1596 | | - } |
|---|
| 1597 | 1678 | |
|---|
| 1598 | 1679 | __ceph_flush_snaps(ci, session); |
|---|
| 1599 | 1680 | out: |
|---|
| .. | .. |
|---|
| 1607 | 1688 | } |
|---|
| 1608 | 1689 | /* we flushed them all; remove this inode from the queue */ |
|---|
| 1609 | 1690 | spin_lock(&mdsc->snap_flush_lock); |
|---|
| 1691 | + if (!list_empty(&ci->i_snap_flush_item)) |
|---|
| 1692 | + need_put = true; |
|---|
| 1610 | 1693 | list_del_init(&ci->i_snap_flush_item); |
|---|
| 1611 | 1694 | spin_unlock(&mdsc->snap_flush_lock); |
|---|
| 1695 | + |
|---|
| 1696 | + if (need_put) |
|---|
| 1697 | + iput(inode); |
|---|
| 1612 | 1698 | } |
|---|
| 1613 | 1699 | |
|---|
| 1614 | 1700 | /* |
|---|
| .. | .. |
|---|
| 1625 | 1711 | int was = ci->i_dirty_caps; |
|---|
| 1626 | 1712 | int dirty = 0; |
|---|
| 1627 | 1713 | |
|---|
| 1714 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 1715 | + |
|---|
| 1628 | 1716 | if (!ci->i_auth_cap) { |
|---|
| 1629 | 1717 | pr_warn("__mark_dirty_caps %p %llx mask %s, " |
|---|
| 1630 | 1718 | "but no auth cap (session was closed?)\n", |
|---|
| .. | .. |
|---|
| 1637 | 1725 | ceph_cap_string(was | mask)); |
|---|
| 1638 | 1726 | ci->i_dirty_caps |= mask; |
|---|
| 1639 | 1727 | if (was == 0) { |
|---|
| 1728 | + struct ceph_mds_session *session = ci->i_auth_cap->session; |
|---|
| 1729 | + |
|---|
| 1640 | 1730 | WARN_ON_ONCE(ci->i_prealloc_cap_flush); |
|---|
| 1641 | 1731 | swap(ci->i_prealloc_cap_flush, *pcf); |
|---|
| 1642 | 1732 | |
|---|
| .. | .. |
|---|
| 1649 | 1739 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); |
|---|
| 1650 | 1740 | BUG_ON(!list_empty(&ci->i_dirty_item)); |
|---|
| 1651 | 1741 | spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 1652 | | - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); |
|---|
| 1742 | + list_add(&ci->i_dirty_item, &session->s_cap_dirty); |
|---|
| 1653 | 1743 | spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 1654 | 1744 | if (ci->i_flushing_caps == 0) { |
|---|
| 1655 | 1745 | ihold(inode); |
|---|
| .. | .. |
|---|
| 1668 | 1758 | |
|---|
| 1669 | 1759 | struct ceph_cap_flush *ceph_alloc_cap_flush(void) |
|---|
| 1670 | 1760 | { |
|---|
| 1671 | | - return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); |
|---|
| 1761 | + struct ceph_cap_flush *cf; |
|---|
| 1762 | + |
|---|
| 1763 | + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); |
|---|
| 1764 | + if (!cf) |
|---|
| 1765 | + return NULL; |
|---|
| 1766 | + |
|---|
| 1767 | + cf->is_capsnap = false; |
|---|
| 1768 | + return cf; |
|---|
| 1672 | 1769 | } |
|---|
| 1673 | 1770 | |
|---|
| 1674 | 1771 | void ceph_free_cap_flush(struct ceph_cap_flush *cf) |
|---|
| .. | .. |
|---|
| 1692 | 1789 | * Remove cap_flush from the mdsc's or inode's flushing cap list. |
|---|
| 1693 | 1790 | * Return true if caller needs to wake up flush waiters. |
|---|
| 1694 | 1791 | */ |
|---|
| 1695 | | -static bool __finish_cap_flush(struct ceph_mds_client *mdsc, |
|---|
| 1696 | | - struct ceph_inode_info *ci, |
|---|
| 1697 | | - struct ceph_cap_flush *cf) |
|---|
| 1792 | +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, |
|---|
| 1793 | + struct ceph_cap_flush *cf) |
|---|
| 1698 | 1794 | { |
|---|
| 1699 | 1795 | struct ceph_cap_flush *prev; |
|---|
| 1700 | 1796 | bool wake = cf->wake; |
|---|
| 1701 | | - if (mdsc) { |
|---|
| 1702 | | - /* are there older pending cap flushes? */ |
|---|
| 1703 | | - if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { |
|---|
| 1704 | | - prev = list_prev_entry(cf, g_list); |
|---|
| 1705 | | - prev->wake = true; |
|---|
| 1706 | | - wake = false; |
|---|
| 1707 | | - } |
|---|
| 1708 | | - list_del(&cf->g_list); |
|---|
| 1709 | | - } else if (ci) { |
|---|
| 1710 | | - if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { |
|---|
| 1711 | | - prev = list_prev_entry(cf, i_list); |
|---|
| 1712 | | - prev->wake = true; |
|---|
| 1713 | | - wake = false; |
|---|
| 1714 | | - } |
|---|
| 1715 | | - list_del(&cf->i_list); |
|---|
| 1716 | | - } else { |
|---|
| 1717 | | - BUG_ON(1); |
|---|
| 1797 | + |
|---|
| 1798 | + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { |
|---|
| 1799 | + prev = list_prev_entry(cf, g_list); |
|---|
| 1800 | + prev->wake = true; |
|---|
| 1801 | + wake = false; |
|---|
| 1718 | 1802 | } |
|---|
| 1803 | + list_del_init(&cf->g_list); |
|---|
| 1804 | + return wake; |
|---|
| 1805 | +} |
|---|
| 1806 | + |
|---|
| 1807 | +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, |
|---|
| 1808 | + struct ceph_cap_flush *cf) |
|---|
| 1809 | +{ |
|---|
| 1810 | + struct ceph_cap_flush *prev; |
|---|
| 1811 | + bool wake = cf->wake; |
|---|
| 1812 | + |
|---|
| 1813 | + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { |
|---|
| 1814 | + prev = list_prev_entry(cf, i_list); |
|---|
| 1815 | + prev->wake = true; |
|---|
| 1816 | + wake = false; |
|---|
| 1817 | + } |
|---|
| 1818 | + list_del_init(&cf->i_list); |
|---|
| 1719 | 1819 | return wake; |
|---|
| 1720 | 1820 | } |
|---|
| 1721 | 1821 | |
|---|
| .. | .. |
|---|
| 1723 | 1823 | * Add dirty inode to the flushing list. Assigned a seq number so we |
|---|
| 1724 | 1824 | * can wait for caps to flush without starving. |
|---|
| 1725 | 1825 | * |
|---|
| 1726 | | - * Called under i_ceph_lock. |
|---|
| 1826 | + * Called under i_ceph_lock. Returns the flush tid. |
|---|
| 1727 | 1827 | */ |
|---|
| 1728 | | -static int __mark_caps_flushing(struct inode *inode, |
|---|
| 1828 | +static u64 __mark_caps_flushing(struct inode *inode, |
|---|
| 1729 | 1829 | struct ceph_mds_session *session, bool wake, |
|---|
| 1730 | | - u64 *flush_tid, u64 *oldest_flush_tid) |
|---|
| 1830 | + u64 *oldest_flush_tid) |
|---|
| 1731 | 1831 | { |
|---|
| 1732 | 1832 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
|---|
| 1733 | 1833 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 1734 | 1834 | struct ceph_cap_flush *cf = NULL; |
|---|
| 1735 | 1835 | int flushing; |
|---|
| 1736 | 1836 | |
|---|
| 1837 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 1737 | 1838 | BUG_ON(ci->i_dirty_caps == 0); |
|---|
| 1738 | 1839 | BUG_ON(list_empty(&ci->i_dirty_item)); |
|---|
| 1739 | 1840 | BUG_ON(!ci->i_prealloc_cap_flush); |
|---|
| .. | .. |
|---|
| 1766 | 1867 | |
|---|
| 1767 | 1868 | list_add_tail(&cf->i_list, &ci->i_cap_flush_list); |
|---|
| 1768 | 1869 | |
|---|
| 1769 | | - *flush_tid = cf->tid; |
|---|
| 1770 | | - return flushing; |
|---|
| 1870 | + return cf->tid; |
|---|
| 1771 | 1871 | } |
|---|
| 1772 | 1872 | |
|---|
| 1773 | 1873 | /* |
|---|
| .. | .. |
|---|
| 1817 | 1917 | * versus held caps. Release, flush, ack revoked caps to mds as |
|---|
| 1818 | 1918 | * appropriate. |
|---|
| 1819 | 1919 | * |
|---|
| 1820 | | - * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay |
|---|
| 1821 | | - * cap release further. |
|---|
| 1822 | 1920 | * CHECK_CAPS_AUTHONLY - we should only check the auth cap |
|---|
| 1823 | 1921 | * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without |
|---|
| 1824 | 1922 | * further delay. |
|---|
| .. | .. |
|---|
| 1826 | 1924 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, |
|---|
| 1827 | 1925 | struct ceph_mds_session *session) |
|---|
| 1828 | 1926 | { |
|---|
| 1829 | | - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); |
|---|
| 1830 | | - struct ceph_mds_client *mdsc = fsc->mdsc; |
|---|
| 1831 | 1927 | struct inode *inode = &ci->vfs_inode; |
|---|
| 1928 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); |
|---|
| 1832 | 1929 | struct ceph_cap *cap; |
|---|
| 1833 | 1930 | u64 flush_tid, oldest_flush_tid; |
|---|
| 1834 | 1931 | int file_wanted, used, cap_used; |
|---|
| .. | .. |
|---|
| 1837 | 1934 | int mds = -1; /* keep track of how far we've gone through i_caps list |
|---|
| 1838 | 1935 | to avoid an infinite loop on retry */ |
|---|
| 1839 | 1936 | struct rb_node *p; |
|---|
| 1840 | | - int delayed = 0, sent = 0; |
|---|
| 1841 | | - bool no_delay = flags & CHECK_CAPS_NODELAY; |
|---|
| 1842 | 1937 | bool queue_invalidate = false; |
|---|
| 1843 | 1938 | bool tried_invalidate = false; |
|---|
| 1844 | 1939 | |
|---|
| 1845 | | - /* if we are unmounting, flush any unused caps immediately. */ |
|---|
| 1846 | | - if (mdsc->stopping) |
|---|
| 1847 | | - no_delay = true; |
|---|
| 1848 | | - |
|---|
| 1849 | 1940 | spin_lock(&ci->i_ceph_lock); |
|---|
| 1850 | | - |
|---|
| 1851 | 1941 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
|---|
| 1852 | 1942 | flags |= CHECK_CAPS_FLUSH; |
|---|
| 1853 | | - |
|---|
| 1854 | | - if (!(flags & CHECK_CAPS_AUTHONLY) || |
|---|
| 1855 | | - (ci->i_auth_cap && __ceph_is_single_caps(ci))) |
|---|
| 1856 | | - __cap_delay_cancel(mdsc, ci); |
|---|
| 1857 | 1943 | |
|---|
| 1858 | 1944 | goto retry_locked; |
|---|
| 1859 | 1945 | retry: |
|---|
| 1860 | 1946 | spin_lock(&ci->i_ceph_lock); |
|---|
| 1861 | 1947 | retry_locked: |
|---|
| 1948 | + /* Caps wanted by virtue of active open files. */ |
|---|
| 1862 | 1949 | file_wanted = __ceph_caps_file_wanted(ci); |
|---|
| 1950 | + |
|---|
| 1951 | + /* Caps which have active references against them */ |
|---|
| 1863 | 1952 | used = __ceph_caps_used(ci); |
|---|
| 1953 | + |
|---|
| 1954 | + /* |
|---|
| 1955 | + * "issued" represents the current caps that the MDS wants us to have. |
|---|
| 1956 | + * "implemented" is the set that we have been granted, and includes the |
|---|
| 1957 | + * ones that have not yet been returned to the MDS (the "revoking" set, |
|---|
| 1958 | + * usually because they have outstanding references). |
|---|
| 1959 | + */ |
|---|
| 1864 | 1960 | issued = __ceph_caps_issued(ci, &implemented); |
|---|
| 1865 | 1961 | revoking = implemented & ~issued; |
|---|
| 1866 | 1962 | |
|---|
| 1867 | 1963 | want = file_wanted; |
|---|
| 1964 | + |
|---|
| 1965 | + /* The ones we currently want to retain (may be adjusted below) */ |
|---|
| 1868 | 1966 | retain = file_wanted | used | CEPH_CAP_PIN; |
|---|
| 1869 | 1967 | if (!mdsc->stopping && inode->i_nlink > 0) { |
|---|
| 1870 | 1968 | if (file_wanted) { |
|---|
| 1871 | 1969 | retain |= CEPH_CAP_ANY; /* be greedy */ |
|---|
| 1872 | 1970 | } else if (S_ISDIR(inode->i_mode) && |
|---|
| 1873 | 1971 | (issued & CEPH_CAP_FILE_SHARED) && |
|---|
| 1874 | | - __ceph_dir_is_complete(ci)) { |
|---|
| 1972 | + __ceph_dir_is_complete(ci)) { |
|---|
| 1875 | 1973 | /* |
|---|
| 1876 | 1974 | * If a directory is complete, we want to keep |
|---|
| 1877 | 1975 | * the exclusive cap. So that MDS does not end up |
|---|
| 1878 | 1976 | * revoking the shared cap on every create/unlink |
|---|
| 1879 | 1977 | * operation. |
|---|
| 1880 | 1978 | */ |
|---|
| 1881 | | - want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
|---|
| 1979 | + if (IS_RDONLY(inode)) { |
|---|
| 1980 | + want = CEPH_CAP_ANY_SHARED; |
|---|
| 1981 | + } else { |
|---|
| 1982 | + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; |
|---|
| 1983 | + } |
|---|
| 1882 | 1984 | retain |= want; |
|---|
| 1883 | 1985 | } else { |
|---|
| 1884 | 1986 | |
|---|
| .. | .. |
|---|
| 1894 | 1996 | } |
|---|
| 1895 | 1997 | |
|---|
| 1896 | 1998 | dout("check_caps %p file_want %s used %s dirty %s flushing %s" |
|---|
| 1897 | | - " issued %s revoking %s retain %s %s%s%s\n", inode, |
|---|
| 1999 | + " issued %s revoking %s retain %s %s%s\n", inode, |
|---|
| 1898 | 2000 | ceph_cap_string(file_wanted), |
|---|
| 1899 | 2001 | ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), |
|---|
| 1900 | 2002 | ceph_cap_string(ci->i_flushing_caps), |
|---|
| 1901 | 2003 | ceph_cap_string(issued), ceph_cap_string(revoking), |
|---|
| 1902 | 2004 | ceph_cap_string(retain), |
|---|
| 1903 | 2005 | (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", |
|---|
| 1904 | | - (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", |
|---|
| 1905 | 2006 | (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); |
|---|
| 1906 | 2007 | |
|---|
| 1907 | 2008 | /* |
|---|
| .. | .. |
|---|
| 1909 | 2010 | * have cached pages, but don't want them, then try to invalidate. |
|---|
| 1910 | 2011 | * If we fail, it's because pages are locked.... try again later. |
|---|
| 1911 | 2012 | */ |
|---|
| 1912 | | - if ((!no_delay || mdsc->stopping) && |
|---|
| 1913 | | - !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ |
|---|
| 2013 | + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && |
|---|
| 2014 | + S_ISREG(inode->i_mode) && |
|---|
| 1914 | 2015 | !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ |
|---|
| 1915 | 2016 | inode->i_data.nrpages && /* have cached pages */ |
|---|
| 1916 | 2017 | (revoking & (CEPH_CAP_FILE_CACHE| |
|---|
| .. | .. |
|---|
| 1927 | 2028 | } |
|---|
| 1928 | 2029 | |
|---|
| 1929 | 2030 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
|---|
| 2031 | + int mflags = 0; |
|---|
| 2032 | + struct cap_msg_args arg; |
|---|
| 2033 | + |
|---|
| 1930 | 2034 | cap = rb_entry(p, struct ceph_cap, ci_node); |
|---|
| 1931 | 2035 | |
|---|
| 1932 | 2036 | /* avoid looping forever */ |
|---|
| .. | .. |
|---|
| 1936 | 2040 | |
|---|
| 1937 | 2041 | /* NOTE: no side-effects allowed, until we take s_mutex */ |
|---|
| 1938 | 2042 | |
|---|
| 2043 | + /* |
|---|
| 2044 | + * If we have an auth cap, we don't need to consider any |
|---|
| 2045 | + * overlapping caps as used. |
|---|
| 2046 | + */ |
|---|
| 1939 | 2047 | cap_used = used; |
|---|
| 1940 | 2048 | if (ci->i_auth_cap && cap != ci->i_auth_cap) |
|---|
| 1941 | 2049 | cap_used &= ~ci->i_auth_cap->issued; |
|---|
| .. | .. |
|---|
| 1990 | 2098 | } |
|---|
| 1991 | 2099 | |
|---|
| 1992 | 2100 | /* things we might delay */ |
|---|
| 1993 | | - if ((cap->issued & ~retain) == 0 && |
|---|
| 1994 | | - cap->mds_wanted == want) |
|---|
| 2101 | + if ((cap->issued & ~retain) == 0) |
|---|
| 1995 | 2102 | continue; /* nope, all good */ |
|---|
| 1996 | 2103 | |
|---|
| 1997 | | - if (no_delay) |
|---|
| 1998 | | - goto ack; |
|---|
| 1999 | | - |
|---|
| 2000 | | - /* delay? */ |
|---|
| 2001 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && |
|---|
| 2002 | | - time_before(jiffies, ci->i_hold_caps_max)) { |
|---|
| 2003 | | - dout(" delaying issued %s -> %s, wanted %s -> %s\n", |
|---|
| 2004 | | - ceph_cap_string(cap->issued), |
|---|
| 2005 | | - ceph_cap_string(cap->issued & retain), |
|---|
| 2006 | | - ceph_cap_string(cap->mds_wanted), |
|---|
| 2007 | | - ceph_cap_string(want)); |
|---|
| 2008 | | - delayed++; |
|---|
| 2009 | | - continue; |
|---|
| 2010 | | - } |
|---|
| 2011 | | - |
|---|
| 2012 | 2104 | ack: |
|---|
| 2013 | | - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
|---|
| 2014 | | - dout(" skipping %p I_NOFLUSH set\n", inode); |
|---|
| 2015 | | - continue; |
|---|
| 2016 | | - } |
|---|
| 2017 | | - |
|---|
| 2018 | 2105 | if (session && session != cap->session) { |
|---|
| 2019 | 2106 | dout("oops, wrong session %p mutex\n", session); |
|---|
| 2020 | 2107 | mutex_unlock(&session->s_mutex); |
|---|
| .. | .. |
|---|
| 2052 | 2139 | if (cap == ci->i_auth_cap && |
|---|
| 2053 | 2140 | (ci->i_ceph_flags & |
|---|
| 2054 | 2141 | (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { |
|---|
| 2055 | | - if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
|---|
| 2142 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
|---|
| 2056 | 2143 | __kick_flushing_caps(mdsc, session, ci, 0); |
|---|
| 2057 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 2058 | | - } |
|---|
| 2059 | 2144 | if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) |
|---|
| 2060 | 2145 | __ceph_flush_snaps(ci, session); |
|---|
| 2061 | 2146 | |
|---|
| .. | .. |
|---|
| 2076 | 2161 | } |
|---|
| 2077 | 2162 | |
|---|
| 2078 | 2163 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { |
|---|
| 2079 | | - flushing = __mark_caps_flushing(inode, session, false, |
|---|
| 2080 | | - &flush_tid, |
|---|
| 2081 | | - &oldest_flush_tid); |
|---|
| 2164 | + flushing = ci->i_dirty_caps; |
|---|
| 2165 | + flush_tid = __mark_caps_flushing(inode, session, false, |
|---|
| 2166 | + &oldest_flush_tid); |
|---|
| 2167 | + if (flags & CHECK_CAPS_FLUSH && |
|---|
| 2168 | + list_empty(&session->s_cap_dirty)) |
|---|
| 2169 | + mflags |= CEPH_CLIENT_CAPS_SYNC; |
|---|
| 2082 | 2170 | } else { |
|---|
| 2083 | 2171 | flushing = 0; |
|---|
| 2084 | 2172 | flush_tid = 0; |
|---|
| .. | .. |
|---|
| 2088 | 2176 | } |
|---|
| 2089 | 2177 | |
|---|
| 2090 | 2178 | mds = cap->mds; /* remember mds, so we don't repeat */ |
|---|
| 2091 | | - sent++; |
|---|
| 2092 | 2179 | |
|---|
| 2093 | | - /* __send_cap drops i_ceph_lock */ |
|---|
| 2094 | | - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, |
|---|
| 2095 | | - cap_used, want, retain, flushing, |
|---|
| 2096 | | - flush_tid, oldest_flush_tid); |
|---|
| 2180 | + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, |
|---|
| 2181 | + want, retain, flushing, flush_tid, oldest_flush_tid); |
|---|
| 2182 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 2183 | + |
|---|
| 2184 | + __send_cap(&arg, ci); |
|---|
| 2185 | + |
|---|
| 2097 | 2186 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
|---|
| 2098 | 2187 | } |
|---|
| 2099 | 2188 | |
|---|
| 2100 | | - /* Reschedule delayed caps release if we delayed anything */ |
|---|
| 2101 | | - if (delayed) |
|---|
| 2189 | + /* periodically re-calculate caps wanted by open files */ |
|---|
| 2190 | + if (__ceph_is_any_real_caps(ci) && |
|---|
| 2191 | + list_empty(&ci->i_cap_delay_list) && |
|---|
| 2192 | + (file_wanted & ~CEPH_CAP_PIN) && |
|---|
| 2193 | + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { |
|---|
| 2102 | 2194 | __cap_delay_requeue(mdsc, ci); |
|---|
| 2195 | + } |
|---|
| 2103 | 2196 | |
|---|
| 2104 | 2197 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 2105 | 2198 | |
|---|
| .. | .. |
|---|
| 2125 | 2218 | |
|---|
| 2126 | 2219 | retry: |
|---|
| 2127 | 2220 | spin_lock(&ci->i_ceph_lock); |
|---|
| 2128 | | - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
|---|
| 2129 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 2130 | | - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); |
|---|
| 2131 | | - goto out; |
|---|
| 2132 | | - } |
|---|
| 2221 | +retry_locked: |
|---|
| 2133 | 2222 | if (ci->i_dirty_caps && ci->i_auth_cap) { |
|---|
| 2134 | 2223 | struct ceph_cap *cap = ci->i_auth_cap; |
|---|
| 2135 | | - int used = __ceph_caps_used(ci); |
|---|
| 2136 | | - int want = __ceph_caps_wanted(ci); |
|---|
| 2137 | | - int delayed; |
|---|
| 2224 | + struct cap_msg_args arg; |
|---|
| 2138 | 2225 | |
|---|
| 2139 | | - if (!session || session != cap->session) { |
|---|
| 2226 | + if (session != cap->session) { |
|---|
| 2140 | 2227 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 2141 | 2228 | if (session) |
|---|
| 2142 | 2229 | mutex_unlock(&session->s_mutex); |
|---|
| .. | .. |
|---|
| 2149 | 2236 | goto out; |
|---|
| 2150 | 2237 | } |
|---|
| 2151 | 2238 | |
|---|
| 2152 | | - flushing = __mark_caps_flushing(inode, session, true, |
|---|
| 2153 | | - &flush_tid, &oldest_flush_tid); |
|---|
| 2154 | | - |
|---|
| 2155 | | - /* __send_cap drops i_ceph_lock */ |
|---|
| 2156 | | - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, |
|---|
| 2157 | | - used, want, (cap->issued | cap->implemented), |
|---|
| 2158 | | - flushing, flush_tid, oldest_flush_tid); |
|---|
| 2159 | | - |
|---|
| 2160 | | - if (delayed) { |
|---|
| 2161 | | - spin_lock(&ci->i_ceph_lock); |
|---|
| 2162 | | - __cap_delay_requeue(mdsc, ci); |
|---|
| 2163 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 2239 | + if (ci->i_ceph_flags & |
|---|
| 2240 | + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { |
|---|
| 2241 | + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) |
|---|
| 2242 | + __kick_flushing_caps(mdsc, session, ci, 0); |
|---|
| 2243 | + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) |
|---|
| 2244 | + __ceph_flush_snaps(ci, session); |
|---|
| 2245 | + goto retry_locked; |
|---|
| 2164 | 2246 | } |
|---|
| 2247 | + |
|---|
| 2248 | + flushing = ci->i_dirty_caps; |
|---|
| 2249 | + flush_tid = __mark_caps_flushing(inode, session, true, |
|---|
| 2250 | + &oldest_flush_tid); |
|---|
| 2251 | + |
|---|
| 2252 | + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, |
|---|
| 2253 | + __ceph_caps_used(ci), __ceph_caps_wanted(ci), |
|---|
| 2254 | + (cap->issued | cap->implemented), |
|---|
| 2255 | + flushing, flush_tid, oldest_flush_tid); |
|---|
| 2256 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 2257 | + |
|---|
| 2258 | + __send_cap(&arg, ci); |
|---|
| 2165 | 2259 | } else { |
|---|
| 2166 | 2260 | if (!list_empty(&ci->i_cap_flush_list)) { |
|---|
| 2167 | 2261 | struct ceph_cap_flush *cf = |
|---|
| .. | .. |
|---|
| 2206 | 2300 | */ |
|---|
| 2207 | 2301 | static int unsafe_request_wait(struct inode *inode) |
|---|
| 2208 | 2302 | { |
|---|
| 2303 | + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
|---|
| 2209 | 2304 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 2210 | 2305 | struct ceph_mds_request *req1 = NULL, *req2 = NULL; |
|---|
| 2211 | 2306 | int ret, err = 0; |
|---|
| .. | .. |
|---|
| 2225 | 2320 | } |
|---|
| 2226 | 2321 | spin_unlock(&ci->i_unsafe_lock); |
|---|
| 2227 | 2322 | |
|---|
| 2323 | + /* |
|---|
| 2324 | + * Manually trigger a flush of the journal logs in all the relevant |
|---|
| 2325 | + * MDSes; otherwise, in the worst case, we must wait up to 5 seconds |
|---|
| 2326 | + * for the MDSes to flush their journal logs periodically. |
|---|
| 2327 | + */ |
|---|
| 2328 | + if (req1 || req2) { |
|---|
| 2329 | + struct ceph_mds_request *req; |
|---|
| 2330 | + struct ceph_mds_session **sessions; |
|---|
| 2331 | + struct ceph_mds_session *s; |
|---|
| 2332 | + unsigned int max_sessions; |
|---|
| 2333 | + int i; |
|---|
| 2334 | + |
|---|
| 2335 | + mutex_lock(&mdsc->mutex); |
|---|
| 2336 | + max_sessions = mdsc->max_sessions; |
|---|
| 2337 | + |
|---|
| 2338 | + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); |
|---|
| 2339 | + if (!sessions) { |
|---|
| 2340 | + mutex_unlock(&mdsc->mutex); |
|---|
| 2341 | + err = -ENOMEM; |
|---|
| 2342 | + goto out; |
|---|
| 2343 | + } |
|---|
| 2344 | + |
|---|
| 2345 | + spin_lock(&ci->i_unsafe_lock); |
|---|
| 2346 | + if (req1) { |
|---|
| 2347 | + list_for_each_entry(req, &ci->i_unsafe_dirops, |
|---|
| 2348 | + r_unsafe_dir_item) { |
|---|
| 2349 | + s = req->r_session; |
|---|
| 2350 | + if (!s) |
|---|
| 2351 | + continue; |
|---|
| 2352 | + if (!sessions[s->s_mds]) { |
|---|
| 2353 | + s = ceph_get_mds_session(s); |
|---|
| 2354 | + sessions[s->s_mds] = s; |
|---|
| 2355 | + } |
|---|
| 2356 | + } |
|---|
| 2357 | + } |
|---|
| 2358 | + if (req2) { |
|---|
| 2359 | + list_for_each_entry(req, &ci->i_unsafe_iops, |
|---|
| 2360 | + r_unsafe_target_item) { |
|---|
| 2361 | + s = req->r_session; |
|---|
| 2362 | + if (!s) |
|---|
| 2363 | + continue; |
|---|
| 2364 | + if (!sessions[s->s_mds]) { |
|---|
| 2365 | + s = ceph_get_mds_session(s); |
|---|
| 2366 | + sessions[s->s_mds] = s; |
|---|
| 2367 | + } |
|---|
| 2368 | + } |
|---|
| 2369 | + } |
|---|
| 2370 | + spin_unlock(&ci->i_unsafe_lock); |
|---|
| 2371 | + |
|---|
| 2372 | + /* the auth MDS */ |
|---|
| 2373 | + spin_lock(&ci->i_ceph_lock); |
|---|
| 2374 | + if (ci->i_auth_cap) { |
|---|
| 2375 | + s = ci->i_auth_cap->session; |
|---|
| 2376 | + if (!sessions[s->s_mds]) |
|---|
| 2377 | + sessions[s->s_mds] = ceph_get_mds_session(s); |
|---|
| 2378 | + } |
|---|
| 2379 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 2380 | + mutex_unlock(&mdsc->mutex); |
|---|
| 2381 | + |
|---|
| 2382 | + /* send flush mdlog request to MDSes */ |
|---|
| 2383 | + for (i = 0; i < max_sessions; i++) { |
|---|
| 2384 | + s = sessions[i]; |
|---|
| 2385 | + if (s) { |
|---|
| 2386 | + send_flush_mdlog(s); |
|---|
| 2387 | + ceph_put_mds_session(s); |
|---|
| 2388 | + } |
|---|
| 2389 | + } |
|---|
| 2390 | + kfree(sessions); |
|---|
| 2391 | + } |
|---|
| 2392 | + |
|---|
| 2228 | 2393 | dout("unsafe_request_wait %p wait on tid %llu %llu\n", |
|---|
| 2229 | 2394 | inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); |
|---|
| 2230 | 2395 | if (req1) { |
|---|
| .. | .. |
|---|
| 2232 | 2397 | ceph_timeout_jiffies(req1->r_timeout)); |
|---|
| 2233 | 2398 | if (ret) |
|---|
| 2234 | 2399 | err = -EIO; |
|---|
| 2235 | | - ceph_mdsc_put_request(req1); |
|---|
| 2236 | 2400 | } |
|---|
| 2237 | 2401 | if (req2) { |
|---|
| 2238 | 2402 | ret = !wait_for_completion_timeout(&req2->r_safe_completion, |
|---|
| 2239 | 2403 | ceph_timeout_jiffies(req2->r_timeout)); |
|---|
| 2240 | 2404 | if (ret) |
|---|
| 2241 | 2405 | err = -EIO; |
|---|
| 2242 | | - ceph_mdsc_put_request(req2); |
|---|
| 2243 | 2406 | } |
|---|
| 2407 | + |
|---|
| 2408 | +out: |
|---|
| 2409 | + if (req1) |
|---|
| 2410 | + ceph_mdsc_put_request(req1); |
|---|
| 2411 | + if (req2) |
|---|
| 2412 | + ceph_mdsc_put_request(req2); |
|---|
| 2244 | 2413 | return err; |
|---|
| 2245 | 2414 | } |
|---|
| 2246 | 2415 | |
|---|
| .. | .. |
|---|
| 2249 | 2418 | struct inode *inode = file->f_mapping->host; |
|---|
| 2250 | 2419 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 2251 | 2420 | u64 flush_tid; |
|---|
| 2252 | | - int ret; |
|---|
| 2421 | + int ret, err; |
|---|
| 2253 | 2422 | int dirty; |
|---|
| 2254 | 2423 | |
|---|
| 2255 | 2424 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); |
|---|
| 2256 | 2425 | |
|---|
| 2257 | 2426 | ret = file_write_and_wait_range(file, start, end); |
|---|
| 2258 | | - if (ret < 0) |
|---|
| 2259 | | - goto out; |
|---|
| 2260 | | - |
|---|
| 2261 | 2427 | if (datasync) |
|---|
| 2262 | 2428 | goto out; |
|---|
| 2263 | 2429 | |
|---|
| 2264 | | - inode_lock(inode); |
|---|
| 2430 | + ret = ceph_wait_on_async_create(inode); |
|---|
| 2431 | + if (ret) |
|---|
| 2432 | + goto out; |
|---|
| 2265 | 2433 | |
|---|
| 2266 | 2434 | dirty = try_flush_caps(inode, &flush_tid); |
|---|
| 2267 | 2435 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); |
|---|
| 2268 | 2436 | |
|---|
| 2269 | | - ret = unsafe_request_wait(inode); |
|---|
| 2437 | + err = unsafe_request_wait(inode); |
|---|
| 2270 | 2438 | |
|---|
| 2271 | 2439 | /* |
|---|
| 2272 | 2440 | * only wait on non-file metadata writeback (the mds |
|---|
| 2273 | 2441 | * can recover size and mtime, so we don't need to |
|---|
| 2274 | 2442 | * wait for that) |
|---|
| 2275 | 2443 | */ |
|---|
| 2276 | | - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
|---|
| 2277 | | - ret = wait_event_interruptible(ci->i_cap_wq, |
|---|
| 2444 | + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
|---|
| 2445 | + err = wait_event_interruptible(ci->i_cap_wq, |
|---|
| 2278 | 2446 | caps_are_flushed(inode, flush_tid)); |
|---|
| 2279 | 2447 | } |
|---|
| 2280 | | - inode_unlock(inode); |
|---|
| 2448 | + |
|---|
| 2449 | + if (err < 0) |
|---|
| 2450 | + ret = err; |
|---|
| 2451 | + |
|---|
| 2452 | + err = file_check_and_advance_wb_err(file); |
|---|
| 2453 | + if (err < 0) |
|---|
| 2454 | + ret = err; |
|---|
| 2281 | 2455 | out: |
|---|
| 2282 | 2456 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); |
|---|
| 2283 | 2457 | return ret; |
|---|
| .. | .. |
|---|
| 2327 | 2501 | struct ceph_cap_flush *cf; |
|---|
| 2328 | 2502 | int ret; |
|---|
| 2329 | 2503 | u64 first_tid = 0; |
|---|
| 2504 | + u64 last_snap_flush = 0; |
|---|
| 2505 | + |
|---|
| 2506 | + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 2507 | + |
|---|
| 2508 | + list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { |
|---|
| 2509 | + if (cf->is_capsnap) { |
|---|
| 2510 | + last_snap_flush = cf->tid; |
|---|
| 2511 | + break; |
|---|
| 2512 | + } |
|---|
| 2513 | + } |
|---|
| 2330 | 2514 | |
|---|
| 2331 | 2515 | list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { |
|---|
| 2332 | 2516 | if (cf->tid < first_tid) |
|---|
| .. | .. |
|---|
| 2341 | 2525 | |
|---|
| 2342 | 2526 | first_tid = cf->tid + 1; |
|---|
| 2343 | 2527 | |
|---|
| 2344 | | - if (cf->caps) { |
|---|
| 2528 | + if (!cf->is_capsnap) { |
|---|
| 2529 | + struct cap_msg_args arg; |
|---|
| 2530 | + |
|---|
| 2345 | 2531 | dout("kick_flushing_caps %p cap %p tid %llu %s\n", |
|---|
| 2346 | 2532 | inode, cap, cf->tid, ceph_cap_string(cf->caps)); |
|---|
| 2347 | | - ci->i_ceph_flags |= CEPH_I_NODELAY; |
|---|
| 2348 | | - ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, |
|---|
| 2349 | | - false, __ceph_caps_used(ci), |
|---|
| 2533 | + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, |
|---|
| 2534 | + (cf->tid < last_snap_flush ? |
|---|
| 2535 | + CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), |
|---|
| 2536 | + __ceph_caps_used(ci), |
|---|
| 2350 | 2537 | __ceph_caps_wanted(ci), |
|---|
| 2351 | | - cap->issued | cap->implemented, |
|---|
| 2538 | + (cap->issued | cap->implemented), |
|---|
| 2352 | 2539 | cf->caps, cf->tid, oldest_flush_tid); |
|---|
| 2353 | | - if (ret) { |
|---|
| 2354 | | - pr_err("kick_flushing_caps: error sending " |
|---|
| 2355 | | - "cap flush, ino (%llx.%llx) " |
|---|
| 2356 | | - "tid %llu flushing %s\n", |
|---|
| 2357 | | - ceph_vinop(inode), cf->tid, |
|---|
| 2358 | | - ceph_cap_string(cf->caps)); |
|---|
| 2359 | | - } |
|---|
| 2540 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 2541 | + __send_cap(&arg, ci); |
|---|
| 2360 | 2542 | } else { |
|---|
| 2361 | 2543 | struct ceph_cap_snap *capsnap = |
|---|
| 2362 | 2544 | container_of(cf, struct ceph_cap_snap, |
|---|
| .. | .. |
|---|
| 2417 | 2599 | */ |
|---|
| 2418 | 2600 | if ((cap->issued & ci->i_flushing_caps) != |
|---|
| 2419 | 2601 | ci->i_flushing_caps) { |
|---|
| 2420 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 2602 | + /* encode_caps_cb() will also reset these sequence |
|---|
| 2603 | + * numbers. Make sure the sequence numbers in the cap flush |
|---|
| 2604 | + * message match those in the later reconnect message */ |
|---|
| 2605 | + cap->seq = 0; |
|---|
| 2606 | + cap->issue_seq = 0; |
|---|
| 2607 | + cap->mseq = 0; |
|---|
| 2421 | 2608 | __kick_flushing_caps(mdsc, session, ci, |
|---|
| 2422 | 2609 | oldest_flush_tid); |
|---|
| 2423 | 2610 | } else { |
|---|
| .. | .. |
|---|
| 2435 | 2622 | struct ceph_cap *cap; |
|---|
| 2436 | 2623 | u64 oldest_flush_tid; |
|---|
| 2437 | 2624 | |
|---|
| 2625 | + lockdep_assert_held(&session->s_mutex); |
|---|
| 2626 | + |
|---|
| 2438 | 2627 | dout("kick_flushing_caps mds%d\n", session->s_mds); |
|---|
| 2439 | 2628 | |
|---|
| 2440 | 2629 | spin_lock(&mdsc->cap_dirty_lock); |
|---|
| .. | .. |
|---|
| 2451 | 2640 | continue; |
|---|
| 2452 | 2641 | } |
|---|
| 2453 | 2642 | if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { |
|---|
| 2454 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 2455 | 2643 | __kick_flushing_caps(mdsc, session, ci, |
|---|
| 2456 | 2644 | oldest_flush_tid); |
|---|
| 2457 | 2645 | } |
|---|
| .. | .. |
|---|
| 2459 | 2647 | } |
|---|
| 2460 | 2648 | } |
|---|
| 2461 | 2649 | |
|---|
| 2462 | | -static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, |
|---|
| 2463 | | - struct ceph_mds_session *session, |
|---|
| 2464 | | - struct inode *inode) |
|---|
| 2465 | | - __releases(ci->i_ceph_lock) |
|---|
| 2650 | +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, |
|---|
| 2651 | + struct ceph_inode_info *ci) |
|---|
| 2466 | 2652 | { |
|---|
| 2467 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 2468 | | - struct ceph_cap *cap; |
|---|
| 2653 | + struct ceph_mds_client *mdsc = session->s_mdsc; |
|---|
| 2654 | + struct ceph_cap *cap = ci->i_auth_cap; |
|---|
| 2469 | 2655 | |
|---|
| 2470 | | - cap = ci->i_auth_cap; |
|---|
| 2471 | | - dout("kick_flushing_inode_caps %p flushing %s\n", inode, |
|---|
| 2656 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 2657 | + |
|---|
| 2658 | + dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, |
|---|
| 2472 | 2659 | ceph_cap_string(ci->i_flushing_caps)); |
|---|
| 2473 | 2660 | |
|---|
| 2474 | 2661 | if (!list_empty(&ci->i_cap_flush_list)) { |
|---|
| .. | .. |
|---|
| 2479 | 2666 | oldest_flush_tid = __get_oldest_flush_tid(mdsc); |
|---|
| 2480 | 2667 | spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 2481 | 2668 | |
|---|
| 2482 | | - ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; |
|---|
| 2483 | 2669 | __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); |
|---|
| 2484 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 2485 | | - } else { |
|---|
| 2486 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 2487 | 2670 | } |
|---|
| 2488 | 2671 | } |
|---|
| 2489 | 2672 | |
|---|
| .. | .. |
|---|
| 2491 | 2674 | /* |
|---|
| 2492 | 2675 | * Take references to capabilities we hold, so that we don't release |
|---|
| 2493 | 2676 | * them to the MDS prematurely. |
|---|
| 2494 | | - * |
|---|
| 2495 | | - * Protected by i_ceph_lock. |
|---|
| 2496 | 2677 | */ |
|---|
| 2497 | | -static void __take_cap_refs(struct ceph_inode_info *ci, int got, |
|---|
| 2678 | +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, |
|---|
| 2498 | 2679 | bool snap_rwsem_locked) |
|---|
| 2499 | 2680 | { |
|---|
| 2681 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 2682 | + |
|---|
| 2500 | 2683 | if (got & CEPH_CAP_PIN) |
|---|
| 2501 | 2684 | ci->i_pin_ref++; |
|---|
| 2502 | 2685 | if (got & CEPH_CAP_FILE_RD) |
|---|
| 2503 | 2686 | ci->i_rd_ref++; |
|---|
| 2504 | 2687 | if (got & CEPH_CAP_FILE_CACHE) |
|---|
| 2505 | 2688 | ci->i_rdcache_ref++; |
|---|
| 2689 | + if (got & CEPH_CAP_FILE_EXCL) |
|---|
| 2690 | + ci->i_fx_ref++; |
|---|
| 2506 | 2691 | if (got & CEPH_CAP_FILE_WR) { |
|---|
| 2507 | 2692 | if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { |
|---|
| 2508 | 2693 | BUG_ON(!snap_rwsem_locked); |
|---|
| .. | .. |
|---|
| 2515 | 2700 | if (ci->i_wb_ref == 0) |
|---|
| 2516 | 2701 | ihold(&ci->vfs_inode); |
|---|
| 2517 | 2702 | ci->i_wb_ref++; |
|---|
| 2518 | | - dout("__take_cap_refs %p wb %d -> %d (?)\n", |
|---|
| 2703 | + dout("%s %p wb %d -> %d (?)\n", __func__, |
|---|
| 2519 | 2704 | &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); |
|---|
| 2520 | 2705 | } |
|---|
| 2521 | 2706 | } |
|---|
| .. | .. |
|---|
| 2526 | 2711 | * to (when applicable), and check against max_size here as well. |
|---|
| 2527 | 2712 | * Note that caller is responsible for ensuring max_size increases are |
|---|
| 2528 | 2713 | * requested from the MDS. |
|---|
| 2714 | + * |
|---|
| 2715 | + * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, |
|---|
| 2716 | + * or a negative error code. There are 3 special error codes: |
|---|
| 2717 | + * -EAGAIN: need to sleep but non-blocking is specified |
|---|
| 2718 | + * -EFBIG: ask caller to call check_max_size() and try again. |
|---|
| 2719 | + * -ESTALE: ask caller to call ceph_renew_caps() and try again. |
|---|
| 2529 | 2720 | */ |
|---|
| 2530 | | -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, |
|---|
| 2531 | | - loff_t endoff, bool nonblock, int *got, int *err) |
|---|
| 2721 | +enum { |
|---|
| 2722 | + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ |
|---|
| 2723 | + NON_BLOCKING = (1 << 8), |
|---|
| 2724 | + CHECK_FILELOCK = (1 << 9), |
|---|
| 2725 | +}; |
|---|
| 2726 | + |
|---|
| 2727 | +static int try_get_cap_refs(struct inode *inode, int need, int want, |
|---|
| 2728 | + loff_t endoff, int flags, int *got) |
|---|
| 2532 | 2729 | { |
|---|
| 2533 | | - struct inode *inode = &ci->vfs_inode; |
|---|
| 2730 | + struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 2534 | 2731 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
|---|
| 2535 | 2732 | int ret = 0; |
|---|
| 2536 | 2733 | int have, implemented; |
|---|
| 2537 | | - int file_wanted; |
|---|
| 2538 | 2734 | bool snap_rwsem_locked = false; |
|---|
| 2539 | 2735 | |
|---|
| 2540 | 2736 | dout("get_cap_refs %p need %s want %s\n", inode, |
|---|
| .. | .. |
|---|
| 2543 | 2739 | again: |
|---|
| 2544 | 2740 | spin_lock(&ci->i_ceph_lock); |
|---|
| 2545 | 2741 | |
|---|
| 2546 | | - /* make sure file is actually open */ |
|---|
| 2547 | | - file_wanted = __ceph_caps_file_wanted(ci); |
|---|
| 2548 | | - if ((file_wanted & need) != need) { |
|---|
| 2549 | | - dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", |
|---|
| 2550 | | - ceph_cap_string(need), ceph_cap_string(file_wanted)); |
|---|
| 2551 | | - *err = -EBADF; |
|---|
| 2552 | | - ret = 1; |
|---|
| 2742 | + if ((flags & CHECK_FILELOCK) && |
|---|
| 2743 | + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { |
|---|
| 2744 | + dout("try_get_cap_refs %p error filelock\n", inode); |
|---|
| 2745 | + ret = -EIO; |
|---|
| 2553 | 2746 | goto out_unlock; |
|---|
| 2554 | 2747 | } |
|---|
| 2555 | 2748 | |
|---|
| .. | .. |
|---|
| 2570 | 2763 | if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { |
|---|
| 2571 | 2764 | dout("get_cap_refs %p endoff %llu > maxsize %llu\n", |
|---|
| 2572 | 2765 | inode, endoff, ci->i_max_size); |
|---|
| 2573 | | - if (endoff > ci->i_requested_max_size) { |
|---|
| 2574 | | - *err = -EAGAIN; |
|---|
| 2575 | | - ret = 1; |
|---|
| 2576 | | - } |
|---|
| 2766 | + if (endoff > ci->i_requested_max_size) |
|---|
| 2767 | + ret = ci->i_auth_cap ? -EFBIG : -ESTALE; |
|---|
| 2577 | 2768 | goto out_unlock; |
|---|
| 2578 | 2769 | } |
|---|
| 2579 | 2770 | /* |
|---|
| .. | .. |
|---|
| 2607 | 2798 | * we can not call down_read() when |
|---|
| 2608 | 2799 | * task isn't in TASK_RUNNING state |
|---|
| 2609 | 2800 | */ |
|---|
| 2610 | | - if (nonblock) { |
|---|
| 2611 | | - *err = -EAGAIN; |
|---|
| 2612 | | - ret = 1; |
|---|
| 2801 | + if (flags & NON_BLOCKING) { |
|---|
| 2802 | + ret = -EAGAIN; |
|---|
| 2613 | 2803 | goto out_unlock; |
|---|
| 2614 | 2804 | } |
|---|
| 2615 | 2805 | |
|---|
| .. | .. |
|---|
| 2620 | 2810 | } |
|---|
| 2621 | 2811 | snap_rwsem_locked = true; |
|---|
| 2622 | 2812 | } |
|---|
| 2623 | | - *got = need | (have & want); |
|---|
| 2624 | | - if ((need & CEPH_CAP_FILE_RD) && |
|---|
| 2813 | + if ((have & want) == want) |
|---|
| 2814 | + *got = need | want; |
|---|
| 2815 | + else |
|---|
| 2816 | + *got = need; |
|---|
| 2817 | + if (S_ISREG(inode->i_mode) && |
|---|
| 2818 | + (need & CEPH_CAP_FILE_RD) && |
|---|
| 2625 | 2819 | !(*got & CEPH_CAP_FILE_CACHE)) |
|---|
| 2626 | 2820 | ceph_disable_fscache_readpage(ci); |
|---|
| 2627 | | - __take_cap_refs(ci, *got, true); |
|---|
| 2821 | + ceph_take_cap_refs(ci, *got, true); |
|---|
| 2628 | 2822 | ret = 1; |
|---|
| 2629 | 2823 | } |
|---|
| 2630 | 2824 | } else { |
|---|
| 2631 | 2825 | int session_readonly = false; |
|---|
| 2632 | | - if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { |
|---|
| 2826 | + int mds_wanted; |
|---|
| 2827 | + if (ci->i_auth_cap && |
|---|
| 2828 | + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { |
|---|
| 2633 | 2829 | struct ceph_mds_session *s = ci->i_auth_cap->session; |
|---|
| 2634 | 2830 | spin_lock(&s->s_cap_lock); |
|---|
| 2635 | 2831 | session_readonly = s->s_readonly; |
|---|
| 2636 | 2832 | spin_unlock(&s->s_cap_lock); |
|---|
| 2637 | 2833 | } |
|---|
| 2638 | 2834 | if (session_readonly) { |
|---|
| 2639 | | - dout("get_cap_refs %p needed %s but mds%d readonly\n", |
|---|
| 2835 | + dout("get_cap_refs %p need %s but mds%d readonly\n", |
|---|
| 2640 | 2836 | inode, ceph_cap_string(need), ci->i_auth_cap->mds); |
|---|
| 2641 | | - *err = -EROFS; |
|---|
| 2642 | | - ret = 1; |
|---|
| 2837 | + ret = -EROFS; |
|---|
| 2643 | 2838 | goto out_unlock; |
|---|
| 2644 | 2839 | } |
|---|
| 2645 | 2840 | |
|---|
| 2646 | | - if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { |
|---|
| 2647 | | - int mds_wanted; |
|---|
| 2648 | | - if (READ_ONCE(mdsc->fsc->mount_state) == |
|---|
| 2649 | | - CEPH_MOUNT_SHUTDOWN) { |
|---|
| 2650 | | - dout("get_cap_refs %p forced umount\n", inode); |
|---|
| 2651 | | - *err = -EIO; |
|---|
| 2652 | | - ret = 1; |
|---|
| 2653 | | - goto out_unlock; |
|---|
| 2654 | | - } |
|---|
| 2655 | | - mds_wanted = __ceph_caps_mds_wanted(ci, false); |
|---|
| 2656 | | - if (need & ~(mds_wanted & need)) { |
|---|
| 2657 | | - dout("get_cap_refs %p caps were dropped" |
|---|
| 2658 | | - " (session killed?)\n", inode); |
|---|
| 2659 | | - *err = -ESTALE; |
|---|
| 2660 | | - ret = 1; |
|---|
| 2661 | | - goto out_unlock; |
|---|
| 2662 | | - } |
|---|
| 2663 | | - if (!(file_wanted & ~mds_wanted)) |
|---|
| 2664 | | - ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; |
|---|
| 2841 | + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
|---|
| 2842 | + dout("get_cap_refs %p forced umount\n", inode); |
|---|
| 2843 | + ret = -EIO; |
|---|
| 2844 | + goto out_unlock; |
|---|
| 2845 | + } |
|---|
| 2846 | + mds_wanted = __ceph_caps_mds_wanted(ci, false); |
|---|
| 2847 | + if (need & ~mds_wanted) { |
|---|
| 2848 | + dout("get_cap_refs %p need %s > mds_wanted %s\n", |
|---|
| 2849 | + inode, ceph_cap_string(need), |
|---|
| 2850 | + ceph_cap_string(mds_wanted)); |
|---|
| 2851 | + ret = -ESTALE; |
|---|
| 2852 | + goto out_unlock; |
|---|
| 2665 | 2853 | } |
|---|
| 2666 | 2854 | |
|---|
| 2667 | | - dout("get_cap_refs %p have %s needed %s\n", inode, |
|---|
| 2855 | + dout("get_cap_refs %p have %s need %s\n", inode, |
|---|
| 2668 | 2856 | ceph_cap_string(have), ceph_cap_string(need)); |
|---|
| 2669 | 2857 | } |
|---|
| 2670 | 2858 | out_unlock: |
|---|
| 2859 | + |
|---|
| 2860 | + __ceph_touch_fmode(ci, mdsc, flags); |
|---|
| 2861 | + |
|---|
| 2671 | 2862 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 2672 | 2863 | if (snap_rwsem_locked) |
|---|
| 2673 | 2864 | up_read(&mdsc->snap_rwsem); |
|---|
| 2865 | + |
|---|
| 2866 | + if (!ret) |
|---|
| 2867 | + ceph_update_cap_mis(&mdsc->metric); |
|---|
| 2868 | + else if (ret == 1) |
|---|
| 2869 | + ceph_update_cap_hit(&mdsc->metric); |
|---|
| 2674 | 2870 | |
|---|
| 2675 | 2871 | dout("get_cap_refs %p ret %d got %s\n", inode, |
|---|
| 2676 | 2872 | ret, ceph_cap_string(*got)); |
|---|
| .. | .. |
|---|
| 2705 | 2901 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
|---|
| 2706 | 2902 | } |
|---|
| 2707 | 2903 | |
|---|
| 2708 | | -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) |
|---|
| 2904 | +static inline int get_used_fmode(int caps) |
|---|
| 2709 | 2905 | { |
|---|
| 2710 | | - int ret, err = 0; |
|---|
| 2906 | + int fmode = 0; |
|---|
| 2907 | + if (caps & CEPH_CAP_FILE_RD) |
|---|
| 2908 | + fmode |= CEPH_FILE_MODE_RD; |
|---|
| 2909 | + if (caps & CEPH_CAP_FILE_WR) |
|---|
| 2910 | + fmode |= CEPH_FILE_MODE_WR; |
|---|
| 2911 | + return fmode; |
|---|
| 2912 | +} |
|---|
| 2913 | + |
|---|
| 2914 | +int ceph_try_get_caps(struct inode *inode, int need, int want, |
|---|
| 2915 | + bool nonblock, int *got) |
|---|
| 2916 | +{ |
|---|
| 2917 | + int ret, flags; |
|---|
| 2711 | 2918 | |
|---|
| 2712 | 2919 | BUG_ON(need & ~CEPH_CAP_FILE_RD); |
|---|
| 2713 | | - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); |
|---|
| 2714 | | - ret = ceph_pool_perm_check(ci, need); |
|---|
| 2715 | | - if (ret < 0) |
|---|
| 2716 | | - return ret; |
|---|
| 2717 | | - |
|---|
| 2718 | | - ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); |
|---|
| 2719 | | - if (ret) { |
|---|
| 2720 | | - if (err == -EAGAIN) { |
|---|
| 2721 | | - ret = 0; |
|---|
| 2722 | | - } else if (err < 0) { |
|---|
| 2723 | | - ret = err; |
|---|
| 2724 | | - } |
|---|
| 2920 | + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | |
|---|
| 2921 | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | |
|---|
| 2922 | + CEPH_CAP_ANY_DIR_OPS)); |
|---|
| 2923 | + if (need) { |
|---|
| 2924 | + ret = ceph_pool_perm_check(inode, need); |
|---|
| 2925 | + if (ret < 0) |
|---|
| 2926 | + return ret; |
|---|
| 2725 | 2927 | } |
|---|
| 2928 | + |
|---|
| 2929 | + flags = get_used_fmode(need | want); |
|---|
| 2930 | + if (nonblock) |
|---|
| 2931 | + flags |= NON_BLOCKING; |
|---|
| 2932 | + |
|---|
| 2933 | + ret = try_get_cap_refs(inode, need, want, 0, flags, got); |
|---|
| 2934 | + /* three special error codes */ |
|---|
| 2935 | + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) |
|---|
| 2936 | + ret = 0; |
|---|
| 2726 | 2937 | return ret; |
|---|
| 2727 | 2938 | } |
|---|
| 2728 | 2939 | |
|---|
| .. | .. |
|---|
| 2731 | 2942 | * due to a small max_size, make sure we check_max_size (and possibly |
|---|
| 2732 | 2943 | * ask the mds) so we don't get hung up indefinitely. |
|---|
| 2733 | 2944 | */ |
|---|
| 2734 | | -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, |
|---|
| 2945 | +int ceph_get_caps(struct file *filp, int need, int want, |
|---|
| 2735 | 2946 | loff_t endoff, int *got, struct page **pinned_page) |
|---|
| 2736 | 2947 | { |
|---|
| 2737 | | - int _got, ret, err = 0; |
|---|
| 2948 | + struct ceph_file_info *fi = filp->private_data; |
|---|
| 2949 | + struct inode *inode = file_inode(filp); |
|---|
| 2950 | + struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 2951 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
|---|
| 2952 | + int ret, _got, flags; |
|---|
| 2738 | 2953 | |
|---|
| 2739 | | - ret = ceph_pool_perm_check(ci, need); |
|---|
| 2954 | + ret = ceph_pool_perm_check(inode, need); |
|---|
| 2740 | 2955 | if (ret < 0) |
|---|
| 2741 | 2956 | return ret; |
|---|
| 2742 | 2957 | |
|---|
| 2743 | | - while (true) { |
|---|
| 2744 | | - if (endoff > 0) |
|---|
| 2745 | | - check_max_size(&ci->vfs_inode, endoff); |
|---|
| 2958 | + if ((fi->fmode & CEPH_FILE_MODE_WR) && |
|---|
| 2959 | + fi->filp_gen != READ_ONCE(fsc->filp_gen)) |
|---|
| 2960 | + return -EBADF; |
|---|
| 2746 | 2961 | |
|---|
| 2747 | | - err = 0; |
|---|
| 2962 | + flags = get_used_fmode(need | want); |
|---|
| 2963 | + |
|---|
| 2964 | + while (true) { |
|---|
| 2965 | + flags &= CEPH_FILE_MODE_MASK; |
|---|
| 2966 | + if (vfs_inode_has_locks(inode)) |
|---|
| 2967 | + flags |= CHECK_FILELOCK; |
|---|
| 2748 | 2968 | _got = 0; |
|---|
| 2749 | | - ret = try_get_cap_refs(ci, need, want, endoff, |
|---|
| 2750 | | - false, &_got, &err); |
|---|
| 2751 | | - if (ret) { |
|---|
| 2752 | | - if (err == -EAGAIN) |
|---|
| 2753 | | - continue; |
|---|
| 2754 | | - if (err < 0) |
|---|
| 2755 | | - ret = err; |
|---|
| 2756 | | - } else { |
|---|
| 2969 | + ret = try_get_cap_refs(inode, need, want, endoff, |
|---|
| 2970 | + flags, &_got); |
|---|
| 2971 | + WARN_ON_ONCE(ret == -EAGAIN); |
|---|
| 2972 | + if (!ret) { |
|---|
| 2973 | + struct ceph_mds_client *mdsc = fsc->mdsc; |
|---|
| 2974 | + struct cap_wait cw; |
|---|
| 2757 | 2975 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
|---|
| 2976 | + |
|---|
| 2977 | + cw.ino = ceph_ino(inode); |
|---|
| 2978 | + cw.tgid = current->tgid; |
|---|
| 2979 | + cw.need = need; |
|---|
| 2980 | + cw.want = want; |
|---|
| 2981 | + |
|---|
| 2982 | + spin_lock(&mdsc->caps_list_lock); |
|---|
| 2983 | + list_add(&cw.list, &mdsc->cap_wait_list); |
|---|
| 2984 | + spin_unlock(&mdsc->caps_list_lock); |
|---|
| 2985 | + |
|---|
| 2986 | + /* make sure the used fmode does not time out */ |
|---|
| 2987 | + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); |
|---|
| 2758 | 2988 | add_wait_queue(&ci->i_cap_wq, &wait); |
|---|
| 2759 | 2989 | |
|---|
| 2760 | | - while (!try_get_cap_refs(ci, need, want, endoff, |
|---|
| 2761 | | - true, &_got, &err)) { |
|---|
| 2990 | + flags |= NON_BLOCKING; |
|---|
| 2991 | + while (!(ret = try_get_cap_refs(inode, need, want, |
|---|
| 2992 | + endoff, flags, &_got))) { |
|---|
| 2762 | 2993 | if (signal_pending(current)) { |
|---|
| 2763 | 2994 | ret = -ERESTARTSYS; |
|---|
| 2764 | 2995 | break; |
|---|
| .. | .. |
|---|
| 2767 | 2998 | } |
|---|
| 2768 | 2999 | |
|---|
| 2769 | 3000 | remove_wait_queue(&ci->i_cap_wq, &wait); |
|---|
| 3001 | + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); |
|---|
| 2770 | 3002 | |
|---|
| 2771 | | - if (err == -EAGAIN) |
|---|
| 3003 | + spin_lock(&mdsc->caps_list_lock); |
|---|
| 3004 | + list_del(&cw.list); |
|---|
| 3005 | + spin_unlock(&mdsc->caps_list_lock); |
|---|
| 3006 | + |
|---|
| 3007 | + if (ret == -EAGAIN) |
|---|
| 2772 | 3008 | continue; |
|---|
| 2773 | | - if (err < 0) |
|---|
| 2774 | | - ret = err; |
|---|
| 2775 | 3009 | } |
|---|
| 3010 | + |
|---|
| 3011 | + if ((fi->fmode & CEPH_FILE_MODE_WR) && |
|---|
| 3012 | + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { |
|---|
| 3013 | + if (ret >= 0 && _got) |
|---|
| 3014 | + ceph_put_cap_refs(ci, _got); |
|---|
| 3015 | + return -EBADF; |
|---|
| 3016 | + } |
|---|
| 3017 | + |
|---|
| 2776 | 3018 | if (ret < 0) { |
|---|
| 2777 | | - if (err == -ESTALE) { |
|---|
| 3019 | + if (ret == -EFBIG || ret == -ESTALE) { |
|---|
| 3020 | + int ret2 = ceph_wait_on_async_create(inode); |
|---|
| 3021 | + if (ret2 < 0) |
|---|
| 3022 | + return ret2; |
|---|
| 3023 | + } |
|---|
| 3024 | + if (ret == -EFBIG) { |
|---|
| 3025 | + check_max_size(inode, endoff); |
|---|
| 3026 | + continue; |
|---|
| 3027 | + } |
|---|
| 3028 | + if (ret == -ESTALE) { |
|---|
| 2778 | 3029 | /* session was killed, try renew caps */ |
|---|
| 2779 | | - ret = ceph_renew_caps(&ci->vfs_inode); |
|---|
| 3030 | + ret = ceph_renew_caps(inode, flags); |
|---|
| 2780 | 3031 | if (ret == 0) |
|---|
| 2781 | 3032 | continue; |
|---|
| 2782 | 3033 | } |
|---|
| 2783 | 3034 | return ret; |
|---|
| 2784 | 3035 | } |
|---|
| 2785 | 3036 | |
|---|
| 2786 | | - if (ci->i_inline_version != CEPH_INLINE_NONE && |
|---|
| 3037 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
|---|
| 3038 | + ci->i_inline_version != CEPH_INLINE_NONE && |
|---|
| 2787 | 3039 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
|---|
| 2788 | | - i_size_read(&ci->vfs_inode) > 0) { |
|---|
| 3040 | + i_size_read(inode) > 0) { |
|---|
| 2789 | 3041 | struct page *page = |
|---|
| 2790 | | - find_get_page(ci->vfs_inode.i_mapping, 0); |
|---|
| 3042 | + find_get_page(inode->i_mapping, 0); |
|---|
| 2791 | 3043 | if (page) { |
|---|
| 2792 | 3044 | if (PageUptodate(page)) { |
|---|
| 2793 | 3045 | *pinned_page = page; |
|---|
| .. | .. |
|---|
| 2806 | 3058 | * getattr request will bring inline data into |
|---|
| 2807 | 3059 | * page cache |
|---|
| 2808 | 3060 | */ |
|---|
| 2809 | | - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, |
|---|
| 3061 | + ret = __ceph_do_getattr(inode, NULL, |
|---|
| 2810 | 3062 | CEPH_STAT_CAP_INLINE_DATA, |
|---|
| 2811 | 3063 | true); |
|---|
| 2812 | 3064 | if (ret < 0) |
|---|
| .. | .. |
|---|
| 2816 | 3068 | break; |
|---|
| 2817 | 3069 | } |
|---|
| 2818 | 3070 | |
|---|
| 2819 | | - if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) |
|---|
| 3071 | + if (S_ISREG(ci->vfs_inode.i_mode) && |
|---|
| 3072 | + (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) |
|---|
| 2820 | 3073 | ceph_fscache_revalidate_cookie(ci); |
|---|
| 2821 | 3074 | |
|---|
| 2822 | 3075 | *got = _got; |
|---|
| .. | .. |
|---|
| 2830 | 3083 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) |
|---|
| 2831 | 3084 | { |
|---|
| 2832 | 3085 | spin_lock(&ci->i_ceph_lock); |
|---|
| 2833 | | - __take_cap_refs(ci, caps, false); |
|---|
| 3086 | + ceph_take_cap_refs(ci, caps, false); |
|---|
| 2834 | 3087 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 2835 | 3088 | } |
|---|
| 2836 | 3089 | |
|---|
| .. | .. |
|---|
| 2867 | 3120 | * If we are releasing a WR cap (from a sync write), finalize any affected |
|---|
| 2868 | 3121 | * cap_snap, and wake up any waiters. |
|---|
| 2869 | 3122 | */ |
|---|
| 2870 | | -void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) |
|---|
| 3123 | +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, |
|---|
| 3124 | + bool skip_checking_caps) |
|---|
| 2871 | 3125 | { |
|---|
| 2872 | 3126 | struct inode *inode = &ci->vfs_inode; |
|---|
| 2873 | 3127 | int last = 0, put = 0, flushsnaps = 0, wake = 0; |
|---|
| .. | .. |
|---|
| 2880 | 3134 | last++; |
|---|
| 2881 | 3135 | if (had & CEPH_CAP_FILE_CACHE) |
|---|
| 2882 | 3136 | if (--ci->i_rdcache_ref == 0) |
|---|
| 3137 | + last++; |
|---|
| 3138 | + if (had & CEPH_CAP_FILE_EXCL) |
|---|
| 3139 | + if (--ci->i_fx_ref == 0) |
|---|
| 2883 | 3140 | last++; |
|---|
| 2884 | 3141 | if (had & CEPH_CAP_FILE_BUFFER) { |
|---|
| 2885 | 3142 | if (--ci->i_wb_ref == 0) { |
|---|
| .. | .. |
|---|
| 2912 | 3169 | ci->i_head_snapc = NULL; |
|---|
| 2913 | 3170 | } |
|---|
| 2914 | 3171 | /* see comment in __ceph_remove_cap() */ |
|---|
| 2915 | | - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) |
|---|
| 3172 | + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) |
|---|
| 2916 | 3173 | drop_inode_snap_realm(ci); |
|---|
| 2917 | 3174 | } |
|---|
| 2918 | 3175 | spin_unlock(&ci->i_ceph_lock); |
|---|
| .. | .. |
|---|
| 2920 | 3177 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), |
|---|
| 2921 | 3178 | last ? " last" : "", put ? " put" : ""); |
|---|
| 2922 | 3179 | |
|---|
| 2923 | | - if (last && !flushsnaps) |
|---|
| 2924 | | - ceph_check_caps(ci, 0, NULL); |
|---|
| 2925 | | - else if (flushsnaps) |
|---|
| 2926 | | - ceph_flush_snaps(ci, NULL); |
|---|
| 3180 | + if (!skip_checking_caps) { |
|---|
| 3181 | + if (last) |
|---|
| 3182 | + ceph_check_caps(ci, 0, NULL); |
|---|
| 3183 | + else if (flushsnaps) |
|---|
| 3184 | + ceph_flush_snaps(ci, NULL); |
|---|
| 3185 | + } |
|---|
| 2927 | 3186 | if (wake) |
|---|
| 2928 | 3187 | wake_up_all(&ci->i_cap_wq); |
|---|
| 2929 | 3188 | while (put-- > 0) |
|---|
| 2930 | 3189 | iput(inode); |
|---|
| 3190 | +} |
|---|
| 3191 | + |
|---|
| 3192 | +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) |
|---|
| 3193 | +{ |
|---|
| 3194 | + __ceph_put_cap_refs(ci, had, false); |
|---|
| 3195 | +} |
|---|
| 3196 | + |
|---|
| 3197 | +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) |
|---|
| 3198 | +{ |
|---|
| 3199 | + __ceph_put_cap_refs(ci, had, true); |
|---|
| 2931 | 3200 | } |
|---|
| 2932 | 3201 | |
|---|
| 2933 | 3202 | /* |
|---|
| .. | .. |
|---|
| 2977 | 3246 | break; |
|---|
| 2978 | 3247 | } |
|---|
| 2979 | 3248 | } |
|---|
| 2980 | | - BUG_ON(!found); |
|---|
| 3249 | + |
|---|
| 3250 | + if (!found) { |
|---|
| 3251 | + /* |
|---|
| 3252 | + * The capsnap should already be removed when removing |
|---|
| 3253 | + * auth cap in the case of a forced unmount. |
|---|
| 3254 | + */ |
|---|
| 3255 | + WARN_ON_ONCE(ci->i_auth_cap); |
|---|
| 3256 | + goto unlock; |
|---|
| 3257 | + } |
|---|
| 3258 | + |
|---|
| 2981 | 3259 | capsnap->dirty_pages -= nr; |
|---|
| 2982 | 3260 | if (capsnap->dirty_pages == 0) { |
|---|
| 2983 | 3261 | complete_capsnap = true; |
|---|
| .. | .. |
|---|
| 2999 | 3277 | complete_capsnap ? " (complete capsnap)" : ""); |
|---|
| 3000 | 3278 | } |
|---|
| 3001 | 3279 | |
|---|
| 3280 | +unlock: |
|---|
| 3002 | 3281 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 3003 | 3282 | |
|---|
| 3004 | 3283 | if (last) { |
|---|
| 3005 | | - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
|---|
| 3284 | + ceph_check_caps(ci, 0, NULL); |
|---|
| 3006 | 3285 | } else if (flush_snaps) { |
|---|
| 3007 | 3286 | ceph_flush_snaps(ci, NULL); |
|---|
| 3008 | 3287 | } |
|---|
| 3009 | 3288 | if (complete_capsnap) |
|---|
| 3010 | 3289 | wake_up_all(&ci->i_cap_wq); |
|---|
| 3011 | | - while (put-- > 0) |
|---|
| 3012 | | - iput(inode); |
|---|
| 3290 | + while (put-- > 0) { |
|---|
| 3291 | + /* avoid calling iput_final() in osd dispatch threads */ |
|---|
| 3292 | + ceph_async_iput(inode); |
|---|
| 3293 | + } |
|---|
| 3013 | 3294 | } |
|---|
| 3014 | 3295 | |
|---|
| 3015 | 3296 | /* |
|---|
| .. | .. |
|---|
| 3054 | 3335 | bool dirstat_valid; |
|---|
| 3055 | 3336 | u64 nfiles; |
|---|
| 3056 | 3337 | u64 nsubdirs; |
|---|
| 3338 | + u64 change_attr; |
|---|
| 3057 | 3339 | /* currently issued */ |
|---|
| 3058 | 3340 | int issued; |
|---|
| 3341 | + struct timespec64 btime; |
|---|
| 3059 | 3342 | }; |
|---|
| 3060 | 3343 | |
|---|
| 3061 | 3344 | /* |
|---|
| .. | .. |
|---|
| 3079 | 3362 | int used, wanted, dirty; |
|---|
| 3080 | 3363 | u64 size = le64_to_cpu(grant->size); |
|---|
| 3081 | 3364 | u64 max_size = le64_to_cpu(grant->max_size); |
|---|
| 3082 | | - int check_caps = 0; |
|---|
| 3365 | + unsigned char check_caps = 0; |
|---|
| 3366 | + bool was_stale = cap->cap_gen < session->s_cap_gen; |
|---|
| 3083 | 3367 | bool wake = false; |
|---|
| 3084 | 3368 | bool writeback = false; |
|---|
| 3085 | 3369 | bool queue_trunc = false; |
|---|
| .. | .. |
|---|
| 3092 | 3376 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, |
|---|
| 3093 | 3377 | inode->i_size); |
|---|
| 3094 | 3378 | |
|---|
| 3379 | + |
|---|
| 3380 | + /* |
|---|
| 3381 | + * If CACHE is being revoked, and we have no dirty buffers, |
|---|
| 3382 | + * try to invalidate (once). (If there are dirty buffers, we |
|---|
| 3383 | + * will invalidate _after_ writeback.) |
|---|
| 3384 | + */ |
|---|
| 3385 | + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ |
|---|
| 3386 | + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && |
|---|
| 3387 | + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
|---|
| 3388 | + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { |
|---|
| 3389 | + if (try_nonblocking_invalidate(inode)) { |
|---|
| 3390 | + /* there were locked pages.. invalidate later |
|---|
| 3391 | + in a separate thread. */ |
|---|
| 3392 | + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
|---|
| 3393 | + queue_invalidate = true; |
|---|
| 3394 | + ci->i_rdcache_revoking = ci->i_rdcache_gen; |
|---|
| 3395 | + } |
|---|
| 3396 | + } |
|---|
| 3397 | + } |
|---|
| 3398 | + |
|---|
| 3399 | + if (was_stale) |
|---|
| 3400 | + cap->issued = cap->implemented = CEPH_CAP_PIN; |
|---|
| 3095 | 3401 | |
|---|
| 3096 | 3402 | /* |
|---|
| 3097 | 3403 | * auth mds of the inode changed. we received the cap export message, |
|---|
| .. | .. |
|---|
| 3108 | 3414 | newcaps |= cap->issued; |
|---|
| 3109 | 3415 | } |
|---|
| 3110 | 3416 | |
|---|
| 3111 | | - /* |
|---|
| 3112 | | - * If CACHE is being revoked, and we have no dirty buffers, |
|---|
| 3113 | | - * try to invalidate (once). (If there are dirty buffers, we |
|---|
| 3114 | | - * will invalidate _after_ writeback.) |
|---|
| 3115 | | - */ |
|---|
| 3116 | | - if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ |
|---|
| 3117 | | - ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && |
|---|
| 3118 | | - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
|---|
| 3119 | | - !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { |
|---|
| 3120 | | - if (try_nonblocking_invalidate(inode)) { |
|---|
| 3121 | | - /* there were locked pages.. invalidate later |
|---|
| 3122 | | - in a separate thread. */ |
|---|
| 3123 | | - if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
|---|
| 3124 | | - queue_invalidate = true; |
|---|
| 3125 | | - ci->i_rdcache_revoking = ci->i_rdcache_gen; |
|---|
| 3126 | | - } |
|---|
| 3127 | | - } |
|---|
| 3128 | | - } |
|---|
| 3129 | | - |
|---|
| 3130 | 3417 | /* side effects now are allowed */ |
|---|
| 3131 | 3418 | cap->cap_gen = session->s_cap_gen; |
|---|
| 3132 | 3419 | cap->seq = seq; |
|---|
| 3133 | 3420 | |
|---|
| 3134 | 3421 | __check_cap_issue(ci, cap, newcaps); |
|---|
| 3135 | 3422 | |
|---|
| 3423 | + inode_set_max_iversion_raw(inode, extra_info->change_attr); |
|---|
| 3424 | + |
|---|
| 3136 | 3425 | if ((newcaps & CEPH_CAP_AUTH_SHARED) && |
|---|
| 3137 | 3426 | (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { |
|---|
| 3138 | 3427 | inode->i_mode = le32_to_cpu(grant->mode); |
|---|
| 3139 | 3428 | inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); |
|---|
| 3140 | 3429 | inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); |
|---|
| 3430 | + ci->i_btime = extra_info->btime; |
|---|
| 3141 | 3431 | dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, |
|---|
| 3142 | 3432 | from_kuid(&init_user_ns, inode->i_uid), |
|---|
| 3143 | 3433 | from_kgid(&init_user_ns, inode->i_gid)); |
|---|
| .. | .. |
|---|
| 3164 | 3454 | ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); |
|---|
| 3165 | 3455 | ci->i_xattrs.version = version; |
|---|
| 3166 | 3456 | ceph_forget_all_cached_acls(inode); |
|---|
| 3457 | + ceph_security_invalidate_secctx(inode); |
|---|
| 3167 | 3458 | } |
|---|
| 3168 | 3459 | } |
|---|
| 3169 | 3460 | |
|---|
| .. | .. |
|---|
| 3216 | 3507 | ci->i_requested_max_size = 0; |
|---|
| 3217 | 3508 | } |
|---|
| 3218 | 3509 | wake = true; |
|---|
| 3219 | | - } else if (ci->i_wanted_max_size > ci->i_max_size && |
|---|
| 3220 | | - ci->i_wanted_max_size > ci->i_requested_max_size) { |
|---|
| 3221 | | - /* CEPH_CAP_OP_IMPORT */ |
|---|
| 3222 | | - wake = true; |
|---|
| 3223 | 3510 | } |
|---|
| 3224 | 3511 | } |
|---|
| 3225 | 3512 | |
|---|
| .. | .. |
|---|
| 3231 | 3518 | ceph_cap_string(wanted), |
|---|
| 3232 | 3519 | ceph_cap_string(used), |
|---|
| 3233 | 3520 | ceph_cap_string(dirty)); |
|---|
| 3234 | | - if (wanted != le32_to_cpu(grant->wanted)) { |
|---|
| 3235 | | - dout("mds wanted %s -> %s\n", |
|---|
| 3236 | | - ceph_cap_string(le32_to_cpu(grant->wanted)), |
|---|
| 3237 | | - ceph_cap_string(wanted)); |
|---|
| 3238 | | - /* imported cap may not have correct mds_wanted */ |
|---|
| 3239 | | - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) |
|---|
| 3240 | | - check_caps = 1; |
|---|
| 3521 | + |
|---|
| 3522 | + if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && |
|---|
| 3523 | + (wanted & ~(cap->mds_wanted | newcaps))) { |
|---|
| 3524 | + /* |
|---|
| 3525 | + * If mds is importing cap, prior cap messages that update |
|---|
| 3526 | + * 'wanted' may get dropped by mds (migrate seq mismatch). |
|---|
| 3527 | + * |
|---|
| 3528 | + * We don't send cap message to update 'wanted' if what we |
|---|
| 3529 | + * want are already issued. If mds revokes caps, cap message |
|---|
| 3530 | + * that releases caps also tells mds what we want. But if |
|---|
| 3531 | + * caps got revoked by mds forcedly (session stale). We may |
|---|
| 3532 | + * haven't told mds what we want. |
|---|
| 3533 | + */ |
|---|
| 3534 | + check_caps = 1; |
|---|
| 3241 | 3535 | } |
|---|
| 3242 | 3536 | |
|---|
| 3243 | 3537 | /* revocation, grant, or no-op? */ |
|---|
| .. | .. |
|---|
| 3248 | 3542 | ceph_cap_string(cap->issued), |
|---|
| 3249 | 3543 | ceph_cap_string(newcaps), |
|---|
| 3250 | 3544 | ceph_cap_string(revoking)); |
|---|
| 3251 | | - if (revoking & used & CEPH_CAP_FILE_BUFFER) |
|---|
| 3545 | + if (S_ISREG(inode->i_mode) && |
|---|
| 3546 | + (revoking & used & CEPH_CAP_FILE_BUFFER)) |
|---|
| 3252 | 3547 | writeback = true; /* initiate writeback; will delay ack */ |
|---|
| 3253 | | - else if (revoking == CEPH_CAP_FILE_CACHE && |
|---|
| 3254 | | - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
|---|
| 3255 | | - queue_invalidate) |
|---|
| 3548 | + else if (queue_invalidate && |
|---|
| 3549 | + revoking == CEPH_CAP_FILE_CACHE && |
|---|
| 3550 | + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) |
|---|
| 3256 | 3551 | ; /* do nothing yet, invalidation will be queued */ |
|---|
| 3257 | 3552 | else if (cap == ci->i_auth_cap) |
|---|
| 3258 | 3553 | check_caps = 1; /* check auth cap only */ |
|---|
| .. | .. |
|---|
| 3279 | 3574 | } |
|---|
| 3280 | 3575 | BUG_ON(cap->issued & ~cap->implemented); |
|---|
| 3281 | 3576 | |
|---|
| 3577 | + /* don't let check_caps skip sending a response to MDS for revoke msgs */ |
|---|
| 3578 | + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { |
|---|
| 3579 | + cap->mds_wanted = 0; |
|---|
| 3580 | + if (cap == ci->i_auth_cap) |
|---|
| 3581 | + check_caps = 1; /* check auth cap only */ |
|---|
| 3582 | + else |
|---|
| 3583 | + check_caps = 2; /* check all caps */ |
|---|
| 3584 | + } |
|---|
| 3585 | + |
|---|
| 3282 | 3586 | if (extra_info->inline_version > 0 && |
|---|
| 3283 | 3587 | extra_info->inline_version >= ci->i_inline_version) { |
|---|
| 3284 | 3588 | ci->i_inline_version = extra_info->inline_version; |
|---|
| .. | .. |
|---|
| 3288 | 3592 | } |
|---|
| 3289 | 3593 | |
|---|
| 3290 | 3594 | if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { |
|---|
| 3291 | | - if (newcaps & ~extra_info->issued) |
|---|
| 3292 | | - wake = true; |
|---|
| 3293 | | - kick_flushing_inode_caps(session->s_mdsc, session, inode); |
|---|
| 3595 | + if (ci->i_auth_cap == cap) { |
|---|
| 3596 | + if (newcaps & ~extra_info->issued) |
|---|
| 3597 | + wake = true; |
|---|
| 3598 | + |
|---|
| 3599 | + if (ci->i_requested_max_size > max_size || |
|---|
| 3600 | + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { |
|---|
| 3601 | + /* re-request max_size if necessary */ |
|---|
| 3602 | + ci->i_requested_max_size = 0; |
|---|
| 3603 | + wake = true; |
|---|
| 3604 | + } |
|---|
| 3605 | + |
|---|
| 3606 | + ceph_kick_flushing_inode_caps(session, ci); |
|---|
| 3607 | + } |
|---|
| 3294 | 3608 | up_read(&session->s_mdsc->snap_rwsem); |
|---|
| 3295 | | - } else { |
|---|
| 3296 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 3297 | 3609 | } |
|---|
| 3610 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 3298 | 3611 | |
|---|
| 3299 | 3612 | if (fill_inline) |
|---|
| 3300 | 3613 | ceph_fill_inline_data(inode, NULL, extra_info->inline_data, |
|---|
| .. | .. |
|---|
| 3318 | 3631 | wake_up_all(&ci->i_cap_wq); |
|---|
| 3319 | 3632 | |
|---|
| 3320 | 3633 | if (check_caps == 1) |
|---|
| 3321 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, |
|---|
| 3634 | + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, |
|---|
| 3322 | 3635 | session); |
|---|
| 3323 | 3636 | else if (check_caps == 2) |
|---|
| 3324 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); |
|---|
| 3637 | + ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); |
|---|
| 3325 | 3638 | else |
|---|
| 3326 | 3639 | mutex_unlock(&session->s_mutex); |
|---|
| 3327 | 3640 | } |
|---|
| .. | .. |
|---|
| 3348 | 3661 | bool wake_mdsc = false; |
|---|
| 3349 | 3662 | |
|---|
| 3350 | 3663 | list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { |
|---|
| 3664 | + /* Is this the one that was flushed? */ |
|---|
| 3351 | 3665 | if (cf->tid == flush_tid) |
|---|
| 3352 | 3666 | cleaned = cf->caps; |
|---|
| 3353 | | - if (cf->caps == 0) /* capsnap */ |
|---|
| 3667 | + |
|---|
| 3668 | + /* Is this a capsnap? */ |
|---|
| 3669 | + if (cf->is_capsnap) |
|---|
| 3354 | 3670 | continue; |
|---|
| 3671 | + |
|---|
| 3355 | 3672 | if (cf->tid <= flush_tid) { |
|---|
| 3356 | | - if (__finish_cap_flush(NULL, ci, cf)) |
|---|
| 3357 | | - wake_ci = true; |
|---|
| 3673 | + /* |
|---|
| 3674 | + * An earlier or current tid. The FLUSH_ACK should |
|---|
| 3675 | + * represent a superset of this flush's caps. |
|---|
| 3676 | + */ |
|---|
| 3677 | + wake_ci |= __detach_cap_flush_from_ci(ci, cf); |
|---|
| 3358 | 3678 | list_add_tail(&cf->i_list, &to_remove); |
|---|
| 3359 | 3679 | } else { |
|---|
| 3680 | + /* |
|---|
| 3681 | + * This is a later one. Any caps in it are still dirty |
|---|
| 3682 | + * so don't count them as cleaned. |
|---|
| 3683 | + */ |
|---|
| 3360 | 3684 | cleaned &= ~cf->caps; |
|---|
| 3361 | 3685 | if (!cleaned) |
|---|
| 3362 | 3686 | break; |
|---|
| .. | .. |
|---|
| 3376 | 3700 | |
|---|
| 3377 | 3701 | spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 3378 | 3702 | |
|---|
| 3379 | | - list_for_each_entry(cf, &to_remove, i_list) { |
|---|
| 3380 | | - if (__finish_cap_flush(mdsc, NULL, cf)) |
|---|
| 3381 | | - wake_mdsc = true; |
|---|
| 3382 | | - } |
|---|
| 3703 | + list_for_each_entry(cf, &to_remove, i_list) |
|---|
| 3704 | + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); |
|---|
| 3383 | 3705 | |
|---|
| 3384 | 3706 | if (ci->i_flushing_caps == 0) { |
|---|
| 3385 | 3707 | if (list_empty(&ci->i_cap_flush_list)) { |
|---|
| .. | .. |
|---|
| 3417 | 3739 | while (!list_empty(&to_remove)) { |
|---|
| 3418 | 3740 | cf = list_first_entry(&to_remove, |
|---|
| 3419 | 3741 | struct ceph_cap_flush, i_list); |
|---|
| 3420 | | - list_del(&cf->i_list); |
|---|
| 3421 | | - ceph_free_cap_flush(cf); |
|---|
| 3742 | + list_del_init(&cf->i_list); |
|---|
| 3743 | + if (!cf->is_capsnap) |
|---|
| 3744 | + ceph_free_cap_flush(cf); |
|---|
| 3422 | 3745 | } |
|---|
| 3423 | 3746 | |
|---|
| 3424 | 3747 | if (wake_ci) |
|---|
| .. | .. |
|---|
| 3427 | 3750 | wake_up_all(&mdsc->cap_flushing_wq); |
|---|
| 3428 | 3751 | if (drop) |
|---|
| 3429 | 3752 | iput(inode); |
|---|
| 3753 | +} |
|---|
| 3754 | + |
|---|
| 3755 | +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, |
|---|
| 3756 | + bool *wake_ci, bool *wake_mdsc) |
|---|
| 3757 | +{ |
|---|
| 3758 | + struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 3759 | + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
|---|
| 3760 | + bool ret; |
|---|
| 3761 | + |
|---|
| 3762 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 3763 | + |
|---|
| 3764 | + dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); |
|---|
| 3765 | + |
|---|
| 3766 | + list_del_init(&capsnap->ci_item); |
|---|
| 3767 | + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); |
|---|
| 3768 | + if (wake_ci) |
|---|
| 3769 | + *wake_ci = ret; |
|---|
| 3770 | + |
|---|
| 3771 | + spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 3772 | + if (list_empty(&ci->i_cap_flush_list)) |
|---|
| 3773 | + list_del_init(&ci->i_flushing_item); |
|---|
| 3774 | + |
|---|
| 3775 | + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); |
|---|
| 3776 | + if (wake_mdsc) |
|---|
| 3777 | + *wake_mdsc = ret; |
|---|
| 3778 | + spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 3779 | +} |
|---|
| 3780 | + |
|---|
| 3781 | +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, |
|---|
| 3782 | + bool *wake_ci, bool *wake_mdsc) |
|---|
| 3783 | +{ |
|---|
| 3784 | + struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 3785 | + |
|---|
| 3786 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 3787 | + |
|---|
| 3788 | + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); |
|---|
| 3789 | + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); |
|---|
| 3430 | 3790 | } |
|---|
| 3431 | 3791 | |
|---|
| 3432 | 3792 | /* |
|---|
| .. | .. |
|---|
| 3466 | 3826 | capsnap, capsnap->follows); |
|---|
| 3467 | 3827 | } |
|---|
| 3468 | 3828 | } |
|---|
| 3469 | | - if (flushed) { |
|---|
| 3470 | | - WARN_ON(capsnap->dirty_pages || capsnap->writing); |
|---|
| 3471 | | - dout(" removing %p cap_snap %p follows %lld\n", |
|---|
| 3472 | | - inode, capsnap, follows); |
|---|
| 3473 | | - list_del(&capsnap->ci_item); |
|---|
| 3474 | | - if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) |
|---|
| 3475 | | - wake_ci = true; |
|---|
| 3476 | | - |
|---|
| 3477 | | - spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 3478 | | - |
|---|
| 3479 | | - if (list_empty(&ci->i_cap_flush_list)) |
|---|
| 3480 | | - list_del_init(&ci->i_flushing_item); |
|---|
| 3481 | | - |
|---|
| 3482 | | - if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) |
|---|
| 3483 | | - wake_mdsc = true; |
|---|
| 3484 | | - |
|---|
| 3485 | | - spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 3486 | | - } |
|---|
| 3829 | + if (flushed) |
|---|
| 3830 | + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); |
|---|
| 3487 | 3831 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 3832 | + |
|---|
| 3488 | 3833 | if (flushed) { |
|---|
| 3489 | 3834 | ceph_put_snap_context(capsnap->context); |
|---|
| 3490 | 3835 | ceph_put_cap_snap(capsnap); |
|---|
| .. | .. |
|---|
| 3501 | 3846 | * |
|---|
| 3502 | 3847 | * caller hold s_mutex. |
|---|
| 3503 | 3848 | */ |
|---|
| 3504 | | -static void handle_cap_trunc(struct inode *inode, |
|---|
| 3849 | +static bool handle_cap_trunc(struct inode *inode, |
|---|
| 3505 | 3850 | struct ceph_mds_caps *trunc, |
|---|
| 3506 | 3851 | struct ceph_mds_session *session) |
|---|
| 3507 | | - __releases(ci->i_ceph_lock) |
|---|
| 3508 | 3852 | { |
|---|
| 3509 | 3853 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 3510 | 3854 | int mds = session->s_mds; |
|---|
| .. | .. |
|---|
| 3515 | 3859 | int implemented = 0; |
|---|
| 3516 | 3860 | int dirty = __ceph_caps_dirty(ci); |
|---|
| 3517 | 3861 | int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); |
|---|
| 3518 | | - int queue_trunc = 0; |
|---|
| 3862 | + bool queue_trunc = false; |
|---|
| 3863 | + |
|---|
| 3864 | + lockdep_assert_held(&ci->i_ceph_lock); |
|---|
| 3519 | 3865 | |
|---|
| 3520 | 3866 | issued |= implemented | dirty; |
|---|
| 3521 | 3867 | |
|---|
| .. | .. |
|---|
| 3523 | 3869 | inode, mds, seq, truncate_size, truncate_seq); |
|---|
| 3524 | 3870 | queue_trunc = ceph_fill_file_size(inode, issued, |
|---|
| 3525 | 3871 | truncate_seq, truncate_size, size); |
|---|
| 3526 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 3527 | | - |
|---|
| 3528 | | - if (queue_trunc) |
|---|
| 3529 | | - ceph_queue_vmtruncate(inode); |
|---|
| 3872 | + return queue_trunc; |
|---|
| 3530 | 3873 | } |
|---|
| 3531 | 3874 | |
|---|
| 3532 | 3875 | /* |
|---|
| .. | .. |
|---|
| 3571 | 3914 | |
|---|
| 3572 | 3915 | if (target < 0) { |
|---|
| 3573 | 3916 | __ceph_remove_cap(cap, false); |
|---|
| 3574 | | - if (!ci->i_auth_cap) |
|---|
| 3575 | | - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; |
|---|
| 3576 | 3917 | goto out_unlock; |
|---|
| 3577 | 3918 | } |
|---|
| 3578 | 3919 | |
|---|
| .. | .. |
|---|
| 3602 | 3943 | tcap->issue_seq = t_seq - 1; |
|---|
| 3603 | 3944 | tcap->issued |= issued; |
|---|
| 3604 | 3945 | tcap->implemented |= issued; |
|---|
| 3605 | | - if (cap == ci->i_auth_cap) |
|---|
| 3946 | + if (cap == ci->i_auth_cap) { |
|---|
| 3606 | 3947 | ci->i_auth_cap = tcap; |
|---|
| 3607 | | - |
|---|
| 3608 | | - if (!list_empty(&ci->i_cap_flush_list) && |
|---|
| 3609 | | - ci->i_auth_cap == tcap) { |
|---|
| 3610 | | - spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 3611 | | - list_move_tail(&ci->i_flushing_item, |
|---|
| 3612 | | - &tcap->session->s_cap_flushing); |
|---|
| 3613 | | - spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 3948 | + change_auth_cap_ses(ci, tcap->session); |
|---|
| 3614 | 3949 | } |
|---|
| 3615 | 3950 | } |
|---|
| 3616 | 3951 | __ceph_remove_cap(cap, false); |
|---|
| .. | .. |
|---|
| 3619 | 3954 | /* add placeholder for the export target */ |
|---|
| 3620 | 3955 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; |
|---|
| 3621 | 3956 | tcap = new_cap; |
|---|
| 3622 | | - ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, |
|---|
| 3957 | + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, |
|---|
| 3623 | 3958 | t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); |
|---|
| 3624 | 3959 | |
|---|
| 3625 | 3960 | if (!list_empty(&ci->i_cap_flush_list) && |
|---|
| .. | .. |
|---|
| 3679 | 4014 | struct ceph_mds_cap_peer *ph, |
|---|
| 3680 | 4015 | struct ceph_mds_session *session, |
|---|
| 3681 | 4016 | struct ceph_cap **target_cap, int *old_issued) |
|---|
| 3682 | | - __acquires(ci->i_ceph_lock) |
|---|
| 3683 | 4017 | { |
|---|
| 3684 | 4018 | struct ceph_inode_info *ci = ceph_inode(inode); |
|---|
| 3685 | 4019 | struct ceph_cap *cap, *ocap, *new_cap = NULL; |
|---|
| .. | .. |
|---|
| 3704 | 4038 | |
|---|
| 3705 | 4039 | dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", |
|---|
| 3706 | 4040 | inode, ci, mds, mseq, peer); |
|---|
| 3707 | | - |
|---|
| 3708 | 4041 | retry: |
|---|
| 3709 | | - spin_lock(&ci->i_ceph_lock); |
|---|
| 3710 | 4042 | cap = __get_cap_for_mds(ci, mds); |
|---|
| 3711 | 4043 | if (!cap) { |
|---|
| 3712 | 4044 | if (!new_cap) { |
|---|
| 3713 | 4045 | spin_unlock(&ci->i_ceph_lock); |
|---|
| 3714 | 4046 | new_cap = ceph_get_cap(mdsc, NULL); |
|---|
| 4047 | + spin_lock(&ci->i_ceph_lock); |
|---|
| 3715 | 4048 | goto retry; |
|---|
| 3716 | 4049 | } |
|---|
| 3717 | 4050 | cap = new_cap; |
|---|
| .. | .. |
|---|
| 3725 | 4058 | __ceph_caps_issued(ci, &issued); |
|---|
| 3726 | 4059 | issued |= __ceph_caps_dirty(ci); |
|---|
| 3727 | 4060 | |
|---|
| 3728 | | - ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, |
|---|
| 4061 | + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, |
|---|
| 3729 | 4062 | realmino, CEPH_CAP_FLAG_AUTH, &new_cap); |
|---|
| 3730 | 4063 | |
|---|
| 3731 | 4064 | ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; |
|---|
| .. | .. |
|---|
| 3745 | 4078 | } |
|---|
| 3746 | 4079 | __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); |
|---|
| 3747 | 4080 | } |
|---|
| 3748 | | - |
|---|
| 3749 | | - /* make sure we re-request max_size, if necessary */ |
|---|
| 3750 | | - ci->i_requested_max_size = 0; |
|---|
| 3751 | 4081 | |
|---|
| 3752 | 4082 | *old_issued = issued; |
|---|
| 3753 | 4083 | *target_cap = cap; |
|---|
| .. | .. |
|---|
| 3777 | 4107 | size_t snaptrace_len; |
|---|
| 3778 | 4108 | void *p, *end; |
|---|
| 3779 | 4109 | struct cap_extra_info extra_info = {}; |
|---|
| 4110 | + bool queue_trunc; |
|---|
| 3780 | 4111 | |
|---|
| 3781 | 4112 | dout("handle_caps from mds%d\n", session->s_mds); |
|---|
| 3782 | 4113 | |
|---|
| .. | .. |
|---|
| 3852 | 4183 | } |
|---|
| 3853 | 4184 | } |
|---|
| 3854 | 4185 | |
|---|
| 3855 | | - if (msg_version >= 11) { |
|---|
| 4186 | + if (msg_version >= 9) { |
|---|
| 3856 | 4187 | struct ceph_timespec *btime; |
|---|
| 3857 | | - u64 change_attr; |
|---|
| 3858 | | - u32 flags; |
|---|
| 3859 | 4188 | |
|---|
| 3860 | | - /* version >= 9 */ |
|---|
| 3861 | 4189 | if (p + sizeof(*btime) > end) |
|---|
| 3862 | 4190 | goto bad; |
|---|
| 3863 | 4191 | btime = p; |
|---|
| 4192 | + ceph_decode_timespec64(&extra_info.btime, btime); |
|---|
| 3864 | 4193 | p += sizeof(*btime); |
|---|
| 3865 | | - ceph_decode_64_safe(&p, end, change_attr, bad); |
|---|
| 4194 | + ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); |
|---|
| 4195 | + } |
|---|
| 4196 | + |
|---|
| 4197 | + if (msg_version >= 11) { |
|---|
| 4198 | + u32 flags; |
|---|
| 3866 | 4199 | /* version >= 10 */ |
|---|
| 3867 | 4200 | ceph_decode_32_safe(&p, end, flags, bad); |
|---|
| 3868 | 4201 | /* version >= 11 */ |
|---|
| .. | .. |
|---|
| 3878 | 4211 | vino.snap, inode); |
|---|
| 3879 | 4212 | |
|---|
| 3880 | 4213 | mutex_lock(&session->s_mutex); |
|---|
| 3881 | | - session->s_seq++; |
|---|
| 4214 | + inc_session_sequence(session); |
|---|
| 3882 | 4215 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
|---|
| 3883 | 4216 | (unsigned)seq); |
|---|
| 3884 | 4217 | |
|---|
| .. | .. |
|---|
| 3894 | 4227 | cap->seq = seq; |
|---|
| 3895 | 4228 | cap->issue_seq = seq; |
|---|
| 3896 | 4229 | spin_lock(&session->s_cap_lock); |
|---|
| 3897 | | - list_add_tail(&cap->session_caps, |
|---|
| 3898 | | - &session->s_cap_releases); |
|---|
| 3899 | | - session->s_num_cap_releases++; |
|---|
| 4230 | + __ceph_queue_cap_release(session, cap); |
|---|
| 3900 | 4231 | spin_unlock(&session->s_cap_lock); |
|---|
| 3901 | 4232 | } |
|---|
| 3902 | 4233 | goto flush_cap_releases; |
|---|
| .. | .. |
|---|
| 3924 | 4255 | } else { |
|---|
| 3925 | 4256 | down_read(&mdsc->snap_rwsem); |
|---|
| 3926 | 4257 | } |
|---|
| 4258 | + spin_lock(&ci->i_ceph_lock); |
|---|
| 3927 | 4259 | handle_cap_import(mdsc, inode, h, peer, session, |
|---|
| 3928 | 4260 | &cap, &extra_info.issued); |
|---|
| 3929 | 4261 | handle_cap_grant(inode, session, cap, |
|---|
| .. | .. |
|---|
| 3960 | 4292 | break; |
|---|
| 3961 | 4293 | |
|---|
| 3962 | 4294 | case CEPH_CAP_OP_TRUNC: |
|---|
| 3963 | | - handle_cap_trunc(inode, h, session); |
|---|
| 4295 | + queue_trunc = handle_cap_trunc(inode, h, session); |
|---|
| 4296 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 4297 | + if (queue_trunc) |
|---|
| 4298 | + ceph_queue_vmtruncate(inode); |
|---|
| 3964 | 4299 | break; |
|---|
| 3965 | 4300 | |
|---|
| 3966 | 4301 | default: |
|---|
| .. | .. |
|---|
| 3969 | 4304 | ceph_cap_op_name(op)); |
|---|
| 3970 | 4305 | } |
|---|
| 3971 | 4306 | |
|---|
| 3972 | | - goto done; |
|---|
| 4307 | +done: |
|---|
| 4308 | + mutex_unlock(&session->s_mutex); |
|---|
| 4309 | +done_unlocked: |
|---|
| 4310 | + ceph_put_string(extra_info.pool_ns); |
|---|
| 4311 | + /* avoid calling iput_final() in mds dispatch threads */ |
|---|
| 4312 | + ceph_async_iput(inode); |
|---|
| 4313 | + return; |
|---|
| 3973 | 4314 | |
|---|
| 3974 | 4315 | flush_cap_releases: |
|---|
| 3975 | 4316 | /* |
|---|
| .. | .. |
|---|
| 3977 | 4318 | * along for the mds (who clearly thinks we still have this |
|---|
| 3978 | 4319 | * cap). |
|---|
| 3979 | 4320 | */ |
|---|
| 3980 | | - ceph_send_cap_releases(mdsc, session); |
|---|
| 3981 | | - |
|---|
| 3982 | | -done: |
|---|
| 3983 | | - mutex_unlock(&session->s_mutex); |
|---|
| 3984 | | -done_unlocked: |
|---|
| 3985 | | - iput(inode); |
|---|
| 3986 | | - ceph_put_string(extra_info.pool_ns); |
|---|
| 3987 | | - return; |
|---|
| 4321 | + ceph_flush_cap_releases(mdsc, session); |
|---|
| 4322 | + goto done; |
|---|
| 3988 | 4323 | |
|---|
| 3989 | 4324 | bad: |
|---|
| 3990 | 4325 | pr_err("ceph_handle_caps: corrupt message\n"); |
|---|
| .. | .. |
|---|
| 3994 | 4329 | |
|---|
| 3995 | 4330 | /* |
|---|
| 3996 | 4331 | * Delayed work handler to process end of delayed cap release LRU list. |
|---|
| 4332 | + * |
|---|
| 4333 | + * If new caps are added to the list while processing it, these won't get |
|---|
| 4334 | + * processed in this run. In this case, the ci->i_hold_caps_max will be |
|---|
| 4335 | + * returned so that the work can be scheduled accordingly. |
|---|
| 3997 | 4336 | */ |
|---|
| 3998 | | -void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) |
|---|
| 4337 | +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) |
|---|
| 3999 | 4338 | { |
|---|
| 4000 | 4339 | struct inode *inode; |
|---|
| 4001 | 4340 | struct ceph_inode_info *ci; |
|---|
| 4002 | | - int flags = CHECK_CAPS_NODELAY; |
|---|
| 4341 | + struct ceph_mount_options *opt = mdsc->fsc->mount_options; |
|---|
| 4342 | + unsigned long delay_max = opt->caps_wanted_delay_max * HZ; |
|---|
| 4343 | + unsigned long loop_start = jiffies; |
|---|
| 4344 | + unsigned long delay = 0; |
|---|
| 4003 | 4345 | |
|---|
| 4004 | 4346 | dout("check_delayed_caps\n"); |
|---|
| 4005 | | - while (1) { |
|---|
| 4006 | | - spin_lock(&mdsc->cap_delay_lock); |
|---|
| 4007 | | - if (list_empty(&mdsc->cap_delay_list)) |
|---|
| 4008 | | - break; |
|---|
| 4347 | + spin_lock(&mdsc->cap_delay_lock); |
|---|
| 4348 | + while (!list_empty(&mdsc->cap_delay_list)) { |
|---|
| 4009 | 4349 | ci = list_first_entry(&mdsc->cap_delay_list, |
|---|
| 4010 | 4350 | struct ceph_inode_info, |
|---|
| 4011 | 4351 | i_cap_delay_list); |
|---|
| 4352 | + if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { |
|---|
| 4353 | + dout("%s caps added recently. Exiting loop", __func__); |
|---|
| 4354 | + delay = ci->i_hold_caps_max; |
|---|
| 4355 | + break; |
|---|
| 4356 | + } |
|---|
| 4012 | 4357 | if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && |
|---|
| 4013 | 4358 | time_before(jiffies, ci->i_hold_caps_max)) |
|---|
| 4014 | 4359 | break; |
|---|
| 4015 | 4360 | list_del_init(&ci->i_cap_delay_list); |
|---|
| 4016 | 4361 | |
|---|
| 4017 | 4362 | inode = igrab(&ci->vfs_inode); |
|---|
| 4018 | | - spin_unlock(&mdsc->cap_delay_lock); |
|---|
| 4019 | | - |
|---|
| 4020 | 4363 | if (inode) { |
|---|
| 4364 | + spin_unlock(&mdsc->cap_delay_lock); |
|---|
| 4021 | 4365 | dout("check_delayed_caps on %p\n", inode); |
|---|
| 4022 | | - ceph_check_caps(ci, flags, NULL); |
|---|
| 4023 | | - iput(inode); |
|---|
| 4366 | + ceph_check_caps(ci, 0, NULL); |
|---|
| 4367 | + /* avoid calling iput_final() in tick thread */ |
|---|
| 4368 | + ceph_async_iput(inode); |
|---|
| 4369 | + spin_lock(&mdsc->cap_delay_lock); |
|---|
| 4024 | 4370 | } |
|---|
| 4025 | 4371 | } |
|---|
| 4026 | 4372 | spin_unlock(&mdsc->cap_delay_lock); |
|---|
| 4373 | + |
|---|
| 4374 | + return delay; |
|---|
| 4027 | 4375 | } |
|---|
| 4028 | 4376 | |
|---|
| 4029 | 4377 | /* |
|---|
| 4030 | 4378 | * Flush all dirty caps to the mds |
|---|
| 4031 | 4379 | */ |
|---|
| 4032 | | -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) |
|---|
| 4380 | +static void flush_dirty_session_caps(struct ceph_mds_session *s) |
|---|
| 4033 | 4381 | { |
|---|
| 4382 | + struct ceph_mds_client *mdsc = s->s_mdsc; |
|---|
| 4034 | 4383 | struct ceph_inode_info *ci; |
|---|
| 4035 | 4384 | struct inode *inode; |
|---|
| 4036 | 4385 | |
|---|
| 4037 | 4386 | dout("flush_dirty_caps\n"); |
|---|
| 4038 | 4387 | spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 4039 | | - while (!list_empty(&mdsc->cap_dirty)) { |
|---|
| 4040 | | - ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, |
|---|
| 4388 | + while (!list_empty(&s->s_cap_dirty)) { |
|---|
| 4389 | + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, |
|---|
| 4041 | 4390 | i_dirty_item); |
|---|
| 4042 | 4391 | inode = &ci->vfs_inode; |
|---|
| 4043 | 4392 | ihold(inode); |
|---|
| 4044 | 4393 | dout("flush_dirty_caps %p\n", inode); |
|---|
| 4045 | 4394 | spin_unlock(&mdsc->cap_dirty_lock); |
|---|
| 4046 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); |
|---|
| 4395 | + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); |
|---|
| 4047 | 4396 | iput(inode); |
|---|
| 4048 | 4397 | spin_lock(&mdsc->cap_dirty_lock); |
|---|
| 4049 | 4398 | } |
|---|
| .. | .. |
|---|
| 4051 | 4400 | dout("flush_dirty_caps done\n"); |
|---|
| 4052 | 4401 | } |
|---|
| 4053 | 4402 | |
|---|
| 4054 | | -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) |
|---|
| 4403 | +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) |
|---|
| 4055 | 4404 | { |
|---|
| 4056 | | - int i; |
|---|
| 4405 | + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); |
|---|
| 4406 | +} |
|---|
| 4407 | + |
|---|
| 4408 | +void __ceph_touch_fmode(struct ceph_inode_info *ci, |
|---|
| 4409 | + struct ceph_mds_client *mdsc, int fmode) |
|---|
| 4410 | +{ |
|---|
| 4411 | + unsigned long now = jiffies; |
|---|
| 4412 | + if (fmode & CEPH_FILE_MODE_RD) |
|---|
| 4413 | + ci->i_last_rd = now; |
|---|
| 4414 | + if (fmode & CEPH_FILE_MODE_WR) |
|---|
| 4415 | + ci->i_last_wr = now; |
|---|
| 4416 | + /* queue periodic check */ |
|---|
| 4417 | + if (fmode && |
|---|
| 4418 | + __ceph_is_any_real_caps(ci) && |
|---|
| 4419 | + list_empty(&ci->i_cap_delay_list)) |
|---|
| 4420 | + __cap_delay_requeue(mdsc, ci); |
|---|
| 4421 | +} |
|---|
| 4422 | + |
|---|
| 4423 | +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) |
|---|
| 4424 | +{ |
|---|
| 4425 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); |
|---|
| 4057 | 4426 | int bits = (fmode << 1) | 1; |
|---|
| 4427 | + bool already_opened = false; |
|---|
| 4428 | + int i; |
|---|
| 4429 | + |
|---|
| 4430 | + if (count == 1) |
|---|
| 4431 | + atomic64_inc(&mdsc->metric.opened_files); |
|---|
| 4432 | + |
|---|
| 4433 | + spin_lock(&ci->i_ceph_lock); |
|---|
| 4058 | 4434 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
|---|
| 4435 | + /* |
|---|
| 4436 | + * If any of the mode ref is larger than 0, |
|---|
| 4437 | + * that means it has been already opened by |
|---|
| 4438 | + * others. Just skip checking the PIN ref. |
|---|
| 4439 | + */ |
|---|
| 4440 | + if (i && ci->i_nr_by_mode[i]) |
|---|
| 4441 | + already_opened = true; |
|---|
| 4442 | + |
|---|
| 4059 | 4443 | if (bits & (1 << i)) |
|---|
| 4060 | | - ci->i_nr_by_mode[i]++; |
|---|
| 4444 | + ci->i_nr_by_mode[i] += count; |
|---|
| 4061 | 4445 | } |
|---|
| 4446 | + |
|---|
| 4447 | + if (!already_opened) |
|---|
| 4448 | + percpu_counter_inc(&mdsc->metric.opened_inodes); |
|---|
| 4449 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 4062 | 4450 | } |
|---|
| 4063 | 4451 | |
|---|
| 4064 | 4452 | /* |
|---|
| .. | .. |
|---|
| 4066 | 4454 | * we may need to release capabilities to the MDS (or schedule |
|---|
| 4067 | 4455 | * their delayed release). |
|---|
| 4068 | 4456 | */ |
|---|
| 4069 | | -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) |
|---|
| 4457 | +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) |
|---|
| 4070 | 4458 | { |
|---|
| 4071 | | - int i, last = 0; |
|---|
| 4459 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); |
|---|
| 4072 | 4460 | int bits = (fmode << 1) | 1; |
|---|
| 4461 | + bool is_closed = true; |
|---|
| 4462 | + int i; |
|---|
| 4463 | + |
|---|
| 4464 | + if (count == 1) |
|---|
| 4465 | + atomic64_dec(&mdsc->metric.opened_files); |
|---|
| 4466 | + |
|---|
| 4073 | 4467 | spin_lock(&ci->i_ceph_lock); |
|---|
| 4074 | 4468 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { |
|---|
| 4075 | 4469 | if (bits & (1 << i)) { |
|---|
| 4076 | | - BUG_ON(ci->i_nr_by_mode[i] == 0); |
|---|
| 4077 | | - if (--ci->i_nr_by_mode[i] == 0) |
|---|
| 4078 | | - last++; |
|---|
| 4470 | + BUG_ON(ci->i_nr_by_mode[i] < count); |
|---|
| 4471 | + ci->i_nr_by_mode[i] -= count; |
|---|
| 4079 | 4472 | } |
|---|
| 4080 | | - } |
|---|
| 4081 | | - dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", |
|---|
| 4082 | | - &ci->vfs_inode, fmode, |
|---|
| 4083 | | - ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], |
|---|
| 4084 | | - ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); |
|---|
| 4085 | | - spin_unlock(&ci->i_ceph_lock); |
|---|
| 4086 | 4473 | |
|---|
| 4087 | | - if (last && ci->i_vino.snap == CEPH_NOSNAP) |
|---|
| 4088 | | - ceph_check_caps(ci, 0, NULL); |
|---|
| 4474 | + /* |
|---|
| 4475 | + * If any of the mode ref is not 0 after |
|---|
| 4476 | + * decreased, that means it is still opened |
|---|
| 4477 | + * by others. Just skip checking the PIN ref. |
|---|
| 4478 | + */ |
|---|
| 4479 | + if (i && ci->i_nr_by_mode[i]) |
|---|
| 4480 | + is_closed = false; |
|---|
| 4481 | + } |
|---|
| 4482 | + |
|---|
| 4483 | + if (is_closed) |
|---|
| 4484 | + percpu_counter_dec(&mdsc->metric.opened_inodes); |
|---|
| 4485 | + spin_unlock(&ci->i_ceph_lock); |
|---|
| 4089 | 4486 | } |
|---|
| 4090 | 4487 | |
|---|
| 4091 | 4488 | /* |
|---|
| 4092 | | - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it |
|---|
| 4489 | + * For a soon-to-be unlinked file, drop the LINK caps. If it |
|---|
| 4093 | 4490 | * looks like the link count will hit 0, drop any other caps (other |
|---|
| 4094 | 4491 | * than PIN) we don't specifically want (due to the file still being |
|---|
| 4095 | 4492 | * open). |
|---|
| .. | .. |
|---|
| 4103 | 4500 | if (inode->i_nlink == 1) { |
|---|
| 4104 | 4501 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); |
|---|
| 4105 | 4502 | |
|---|
| 4106 | | - ci->i_ceph_flags |= CEPH_I_NODELAY; |
|---|
| 4107 | 4503 | if (__ceph_caps_dirty(ci)) { |
|---|
| 4108 | 4504 | struct ceph_mds_client *mdsc = |
|---|
| 4109 | 4505 | ceph_inode_to_client(inode)->mdsc; |
|---|
| .. | .. |
|---|
| 4159 | 4555 | if (force || (cap->issued & drop)) { |
|---|
| 4160 | 4556 | if (cap->issued & drop) { |
|---|
| 4161 | 4557 | int wanted = __ceph_caps_wanted(ci); |
|---|
| 4162 | | - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) |
|---|
| 4163 | | - wanted |= cap->mds_wanted; |
|---|
| 4164 | 4558 | dout("encode_inode_release %p cap %p " |
|---|
| 4165 | 4559 | "%s -> %s, wanted %s -> %s\n", inode, cap, |
|---|
| 4166 | 4560 | ceph_cap_string(cap->issued), |
|---|
| .. | .. |
|---|
| 4171 | 4565 | cap->issued &= ~drop; |
|---|
| 4172 | 4566 | cap->implemented &= ~drop; |
|---|
| 4173 | 4567 | cap->mds_wanted = wanted; |
|---|
| 4568 | + if (cap == ci->i_auth_cap && |
|---|
| 4569 | + !(wanted & CEPH_CAP_ANY_FILE_WR)) |
|---|
| 4570 | + ci->i_requested_max_size = 0; |
|---|
| 4174 | 4571 | } else { |
|---|
| 4175 | 4572 | dout("encode_inode_release %p cap %p %s" |
|---|
| 4176 | 4573 | " (force)\n", inode, cap, |
|---|