| .. | .. | 
|---|
| 8 | 8 | #include <linux/vmalloc.h> | 
|---|
| 9 | 9 | #include <linux/wait.h> | 
|---|
| 10 | 10 | #include <linux/writeback.h> | 
|---|
|  | 11 | +#include <linux/iversion.h> | 
|---|
| 11 | 12 |  | 
|---|
| 12 | 13 | #include "super.h" | 
|---|
| 13 | 14 | #include "mds_client.h" | 
|---|
| .. | .. | 
|---|
| 148 | 149 | spin_unlock(&mdsc->caps_list_lock); | 
|---|
| 149 | 150 | } | 
|---|
| 150 | 151 |  | 
|---|
| 151 |  | -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) | 
|---|
|  | 152 | +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, | 
|---|
|  | 153 | +			      struct ceph_mount_options *fsopt) | 
|---|
| 152 | 154 | { | 
|---|
| 153 | 155 | spin_lock(&mdsc->caps_list_lock); | 
|---|
| 154 |  | -	mdsc->caps_min_count += delta; | 
|---|
| 155 |  | -	BUG_ON(mdsc->caps_min_count < 0); | 
|---|
|  | 156 | +	mdsc->caps_min_count = fsopt->max_readdir; | 
|---|
|  | 157 | +	if (mdsc->caps_min_count < 1024) | 
|---|
|  | 158 | +		mdsc->caps_min_count = 1024; | 
|---|
|  | 159 | +	mdsc->caps_use_max = fsopt->caps_max; | 
|---|
|  | 160 | +	if (mdsc->caps_use_max > 0 && | 
|---|
|  | 161 | +	    mdsc->caps_use_max < mdsc->caps_min_count) | 
|---|
|  | 162 | +		mdsc->caps_use_max = mdsc->caps_min_count; | 
|---|
| 156 | 163 | spin_unlock(&mdsc->caps_list_lock); | 
|---|
| 157 | 164 | } | 
|---|
| 158 | 165 |  | 
|---|
| .. | .. | 
|---|
| 272 | 279 | if (!err) { | 
|---|
| 273 | 280 | BUG_ON(have + alloc != need); | 
|---|
| 274 | 281 | ctx->count = need; | 
|---|
|  | 282 | +		ctx->used = 0; | 
|---|
| 275 | 283 | } | 
|---|
| 276 | 284 |  | 
|---|
| 277 | 285 | spin_lock(&mdsc->caps_list_lock); | 
|---|
| .. | .. | 
|---|
| 295 | 303 | } | 
|---|
| 296 | 304 |  | 
|---|
| 297 | 305 | void ceph_unreserve_caps(struct ceph_mds_client *mdsc, | 
|---|
| 298 |  | -			struct ceph_cap_reservation *ctx) | 
|---|
|  | 306 | +			 struct ceph_cap_reservation *ctx) | 
|---|
| 299 | 307 | { | 
|---|
|  | 308 | +	bool reclaim = false; | 
|---|
|  | 309 | +	if (!ctx->count) | 
|---|
|  | 310 | +		return; | 
|---|
|  | 311 | + | 
|---|
| 300 | 312 | dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); | 
|---|
| 301 | 313 | spin_lock(&mdsc->caps_list_lock); | 
|---|
| 302 | 314 | __ceph_unreserve_caps(mdsc, ctx->count); | 
|---|
| 303 | 315 | ctx->count = 0; | 
|---|
|  | 316 | + | 
|---|
|  | 317 | +	if (mdsc->caps_use_max > 0 && | 
|---|
|  | 318 | +	    mdsc->caps_use_count > mdsc->caps_use_max) | 
|---|
|  | 319 | +		reclaim = true; | 
|---|
| 304 | 320 | spin_unlock(&mdsc->caps_list_lock); | 
|---|
|  | 321 | + | 
|---|
|  | 322 | +	if (reclaim) | 
|---|
|  | 323 | +		ceph_reclaim_caps_nr(mdsc, ctx->used); | 
|---|
| 305 | 324 | } | 
|---|
| 306 | 325 |  | 
|---|
| 307 | 326 | struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, | 
|---|
| .. | .. | 
|---|
| 346 | 365 | BUG_ON(list_empty(&mdsc->caps_list)); | 
|---|
| 347 | 366 |  | 
|---|
| 348 | 367 | ctx->count--; | 
|---|
|  | 368 | +	ctx->used++; | 
|---|
| 349 | 369 | mdsc->caps_reserve_count--; | 
|---|
| 350 | 370 | mdsc->caps_use_count++; | 
|---|
| 351 | 371 |  | 
|---|
| .. | .. | 
|---|
| 438 | 458 | } | 
|---|
| 439 | 459 |  | 
|---|
| 440 | 460 | /* | 
|---|
| 441 |  | - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. | 
|---|
| 442 |  | - */ | 
|---|
| 443 |  | -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) | 
|---|
| 444 |  | -{ | 
|---|
| 445 |  | -	struct ceph_cap *cap; | 
|---|
| 446 |  | -	int mds = -1; | 
|---|
| 447 |  | -	struct rb_node *p; | 
|---|
| 448 |  | - | 
|---|
| 449 |  | -	/* prefer mds with WR|BUFFER|EXCL caps */ | 
|---|
| 450 |  | -	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 
|---|
| 451 |  | -		cap = rb_entry(p, struct ceph_cap, ci_node); | 
|---|
| 452 |  | -		mds = cap->mds; | 
|---|
| 453 |  | -		if (cap->issued & (CEPH_CAP_FILE_WR | | 
|---|
| 454 |  | -				   CEPH_CAP_FILE_BUFFER | | 
|---|
| 455 |  | -				   CEPH_CAP_FILE_EXCL)) | 
|---|
| 456 |  | -			break; | 
|---|
| 457 |  | -	} | 
|---|
| 458 |  | -	return mds; | 
|---|
| 459 |  | -} | 
|---|
| 460 |  | - | 
|---|
| 461 |  | -int ceph_get_cap_mds(struct inode *inode) | 
|---|
| 462 |  | -{ | 
|---|
| 463 |  | -	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 464 |  | -	int mds; | 
|---|
| 465 |  | -	spin_lock(&ci->i_ceph_lock); | 
|---|
| 466 |  | -	mds = __ceph_get_cap_mds(ceph_inode(inode)); | 
|---|
| 467 |  | -	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 468 |  | -	return mds; | 
|---|
| 469 |  | -} | 
|---|
| 470 |  | - | 
|---|
| 471 |  | -/* | 
|---|
| 472 | 461 | * Called under i_ceph_lock. | 
|---|
| 473 | 462 | */ | 
|---|
| 474 | 463 | static void __insert_cap_node(struct ceph_inode_info *ci, | 
|---|
| .. | .. | 
|---|
| 500 | 489 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | 
|---|
| 501 | 490 | struct ceph_inode_info *ci) | 
|---|
| 502 | 491 | { | 
|---|
| 503 |  | -	struct ceph_mount_options *ma = mdsc->fsc->mount_options; | 
|---|
| 504 |  | - | 
|---|
| 505 |  | -	ci->i_hold_caps_min = round_jiffies(jiffies + | 
|---|
| 506 |  | -					    ma->caps_wanted_delay_min * HZ); | 
|---|
|  | 492 | +	struct ceph_mount_options *opt = mdsc->fsc->mount_options; | 
|---|
| 507 | 493 | ci->i_hold_caps_max = round_jiffies(jiffies + | 
|---|
| 508 |  | -					    ma->caps_wanted_delay_max * HZ); | 
|---|
| 509 |  | -	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, | 
|---|
| 510 |  | -	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); | 
|---|
|  | 494 | +					    opt->caps_wanted_delay_max * HZ); | 
|---|
|  | 495 | +	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, | 
|---|
|  | 496 | +	     ci->i_hold_caps_max - jiffies); | 
|---|
| 511 | 497 | } | 
|---|
| 512 | 498 |  | 
|---|
| 513 | 499 | /* | 
|---|
| .. | .. | 
|---|
| 521 | 507 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | 
|---|
| 522 | 508 | struct ceph_inode_info *ci) | 
|---|
| 523 | 509 | { | 
|---|
| 524 |  | -	__cap_set_timeouts(mdsc, ci); | 
|---|
| 525 |  | -	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, | 
|---|
|  | 510 | +	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, | 
|---|
| 526 | 511 | ci->i_ceph_flags, ci->i_hold_caps_max); | 
|---|
| 527 | 512 | if (!mdsc->stopping) { | 
|---|
| 528 | 513 | spin_lock(&mdsc->cap_delay_lock); | 
|---|
| .. | .. | 
|---|
| 531 | 516 | goto no_change; | 
|---|
| 532 | 517 | list_del_init(&ci->i_cap_delay_list); | 
|---|
| 533 | 518 | } | 
|---|
|  | 519 | +		__cap_set_timeouts(mdsc, ci); | 
|---|
| 534 | 520 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); | 
|---|
| 535 | 521 | no_change: | 
|---|
| 536 | 522 | spin_unlock(&mdsc->cap_delay_lock); | 
|---|
| .. | .. | 
|---|
| 570 | 556 | spin_unlock(&mdsc->cap_delay_lock); | 
|---|
| 571 | 557 | } | 
|---|
| 572 | 558 |  | 
|---|
| 573 |  | -/* | 
|---|
| 574 |  | - * Common issue checks for add_cap, handle_cap_grant. | 
|---|
| 575 |  | - */ | 
|---|
|  | 559 | +/* Common issue checks for add_cap, handle_cap_grant. */ | 
|---|
| 576 | 560 | static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, | 
|---|
| 577 | 561 | unsigned issued) | 
|---|
| 578 | 562 | { | 
|---|
| 579 | 563 | unsigned had = __ceph_caps_issued(ci, NULL); | 
|---|
| 580 | 564 |  | 
|---|
|  | 565 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 566 | + | 
|---|
| 581 | 567 | /* | 
|---|
| 582 | 568 | * Each time we receive FILE_CACHE anew, we increment | 
|---|
| 583 | 569 | * i_rdcache_gen. | 
|---|
| 584 | 570 | */ | 
|---|
| 585 |  | -	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | 
|---|
|  | 571 | +	if (S_ISREG(ci->vfs_inode.i_mode) && | 
|---|
|  | 572 | +	    (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | 
|---|
| 586 | 573 | (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { | 
|---|
| 587 | 574 | ci->i_rdcache_gen++; | 
|---|
| 588 | 575 | } | 
|---|
| .. | .. | 
|---|
| 601 | 588 | __ceph_dir_clear_complete(ci); | 
|---|
| 602 | 589 | } | 
|---|
| 603 | 590 | } | 
|---|
|  | 591 | + | 
|---|
|  | 592 | +	/* Wipe saved layout if we're losing DIR_CREATE caps */ | 
|---|
|  | 593 | +	if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && | 
|---|
|  | 594 | +		!(issued & CEPH_CAP_DIR_CREATE)) { | 
|---|
|  | 595 | +	     ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); | 
|---|
|  | 596 | +	     memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); | 
|---|
|  | 597 | +	} | 
|---|
|  | 598 | +} | 
|---|
|  | 599 | + | 
|---|
|  | 600 | +/** | 
|---|
|  | 601 | + * change_auth_cap_ses - move inode to appropriate lists when auth caps change | 
|---|
|  | 602 | + * @ci: inode to be moved | 
|---|
|  | 603 | + * @session: new auth caps session | 
|---|
|  | 604 | + */ | 
|---|
|  | 605 | +static void change_auth_cap_ses(struct ceph_inode_info *ci, | 
|---|
|  | 606 | +				struct ceph_mds_session *session) | 
|---|
|  | 607 | +{ | 
|---|
|  | 608 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 609 | + | 
|---|
|  | 610 | +	if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) | 
|---|
|  | 611 | +		return; | 
|---|
|  | 612 | + | 
|---|
|  | 613 | +	spin_lock(&session->s_mdsc->cap_dirty_lock); | 
|---|
|  | 614 | +	if (!list_empty(&ci->i_dirty_item)) | 
|---|
|  | 615 | +		list_move(&ci->i_dirty_item, &session->s_cap_dirty); | 
|---|
|  | 616 | +	if (!list_empty(&ci->i_flushing_item)) | 
|---|
|  | 617 | +		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); | 
|---|
|  | 618 | +	spin_unlock(&session->s_mdsc->cap_dirty_lock); | 
|---|
| 604 | 619 | } | 
|---|
| 605 | 620 |  | 
|---|
| 606 | 621 | /* | 
|---|
| 607 | 622 | * Add a capability under the given MDS session. | 
|---|
| 608 | 623 | * | 
|---|
| 609 |  | - * Caller should hold session snap_rwsem (read) and s_mutex. | 
|---|
|  | 624 | + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock | 
|---|
| 610 | 625 | * | 
|---|
| 611 | 626 | * @fmode is the open file mode, if we are opening a file, otherwise | 
|---|
| 612 | 627 | * it is < 0.  (This is so we can atomically add the cap and add an | 
|---|
| .. | .. | 
|---|
| 614 | 629 | */ | 
|---|
| 615 | 630 | void ceph_add_cap(struct inode *inode, | 
|---|
| 616 | 631 | struct ceph_mds_session *session, u64 cap_id, | 
|---|
| 617 |  | -		  int fmode, unsigned issued, unsigned wanted, | 
|---|
|  | 632 | +		  unsigned issued, unsigned wanted, | 
|---|
| 618 | 633 | unsigned seq, unsigned mseq, u64 realmino, int flags, | 
|---|
| 619 | 634 | struct ceph_cap **new_cap) | 
|---|
| 620 | 635 | { | 
|---|
| .. | .. | 
|---|
| 623 | 638 | struct ceph_cap *cap; | 
|---|
| 624 | 639 | int mds = session->s_mds; | 
|---|
| 625 | 640 | int actual_wanted; | 
|---|
|  | 641 | +	u32 gen; | 
|---|
|  | 642 | + | 
|---|
|  | 643 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
| 626 | 644 |  | 
|---|
| 627 | 645 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, | 
|---|
| 628 | 646 | session->s_mds, cap_id, ceph_cap_string(issued), seq); | 
|---|
| 629 | 647 |  | 
|---|
| 630 |  | -	/* | 
|---|
| 631 |  | -	 * If we are opening the file, include file mode wanted bits | 
|---|
| 632 |  | -	 * in wanted. | 
|---|
| 633 |  | -	 */ | 
|---|
| 634 |  | -	if (fmode >= 0) | 
|---|
| 635 |  | -		wanted |= ceph_caps_for_mode(fmode); | 
|---|
|  | 648 | +	spin_lock(&session->s_gen_ttl_lock); | 
|---|
|  | 649 | +	gen = session->s_cap_gen; | 
|---|
|  | 650 | +	spin_unlock(&session->s_gen_ttl_lock); | 
|---|
| 636 | 651 |  | 
|---|
| 637 | 652 | cap = __get_cap_for_mds(ci, mds); | 
|---|
| 638 | 653 | if (!cap) { | 
|---|
| .. | .. | 
|---|
| 653 | 668 | spin_lock(&session->s_cap_lock); | 
|---|
| 654 | 669 | list_add_tail(&cap->session_caps, &session->s_caps); | 
|---|
| 655 | 670 | session->s_nr_caps++; | 
|---|
|  | 671 | +		atomic64_inc(&mdsc->metric.total_caps); | 
|---|
| 656 | 672 | spin_unlock(&session->s_cap_lock); | 
|---|
| 657 | 673 | } else { | 
|---|
|  | 674 | +		spin_lock(&session->s_cap_lock); | 
|---|
|  | 675 | +		list_move_tail(&cap->session_caps, &session->s_caps); | 
|---|
|  | 676 | +		spin_unlock(&session->s_cap_lock); | 
|---|
|  | 677 | + | 
|---|
|  | 678 | +		if (cap->cap_gen < gen) | 
|---|
|  | 679 | +			cap->issued = cap->implemented = CEPH_CAP_PIN; | 
|---|
|  | 680 | + | 
|---|
| 658 | 681 | /* | 
|---|
| 659 | 682 | * auth mds of the inode changed. we received the cap export | 
|---|
| 660 | 683 | * message, but still haven't received the cap import message. | 
|---|
| .. | .. | 
|---|
| 726 | 749 | if (flags & CEPH_CAP_FLAG_AUTH) { | 
|---|
| 727 | 750 | if (!ci->i_auth_cap || | 
|---|
| 728 | 751 | ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { | 
|---|
|  | 752 | +			if (ci->i_auth_cap && | 
|---|
|  | 753 | +			    ci->i_auth_cap->session != cap->session) | 
|---|
|  | 754 | +				change_auth_cap_ses(ci, cap->session); | 
|---|
| 729 | 755 | ci->i_auth_cap = cap; | 
|---|
| 730 | 756 | cap->mds_wanted = wanted; | 
|---|
| 731 | 757 | } | 
|---|
| .. | .. | 
|---|
| 746 | 772 | cap->seq = seq; | 
|---|
| 747 | 773 | cap->issue_seq = seq; | 
|---|
| 748 | 774 | cap->mseq = mseq; | 
|---|
| 749 |  | -	cap->cap_gen = session->s_cap_gen; | 
|---|
| 750 |  | - | 
|---|
| 751 |  | -	if (fmode >= 0) | 
|---|
| 752 |  | -		__ceph_get_fmode(ci, fmode); | 
|---|
|  | 775 | +	cap->cap_gen = gen; | 
|---|
| 753 | 776 | } | 
|---|
| 754 | 777 |  | 
|---|
| 755 | 778 | /* | 
|---|
| .. | .. | 
|---|
| 864 | 887 | int have = ci->i_snap_caps; | 
|---|
| 865 | 888 |  | 
|---|
| 866 | 889 | if ((have & mask) == mask) { | 
|---|
| 867 |  | -		dout("__ceph_caps_issued_mask %p snap issued %s" | 
|---|
| 868 |  | -		     " (mask %s)\n", &ci->vfs_inode, | 
|---|
|  | 890 | +		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" | 
|---|
|  | 891 | +		     " (mask %s)\n", ceph_ino(&ci->vfs_inode), | 
|---|
| 869 | 892 | ceph_cap_string(have), | 
|---|
| 870 | 893 | ceph_cap_string(mask)); | 
|---|
| 871 | 894 | return 1; | 
|---|
| .. | .. | 
|---|
| 876 | 899 | if (!__cap_is_valid(cap)) | 
|---|
| 877 | 900 | continue; | 
|---|
| 878 | 901 | if ((cap->issued & mask) == mask) { | 
|---|
| 879 |  | -			dout("__ceph_caps_issued_mask %p cap %p issued %s" | 
|---|
| 880 |  | -			     " (mask %s)\n", &ci->vfs_inode, cap, | 
|---|
|  | 902 | +			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" | 
|---|
|  | 903 | +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, | 
|---|
| 881 | 904 | ceph_cap_string(cap->issued), | 
|---|
| 882 | 905 | ceph_cap_string(mask)); | 
|---|
| 883 | 906 | if (touch) | 
|---|
| .. | .. | 
|---|
| 888 | 911 | /* does a combination of caps satisfy mask? */ | 
|---|
| 889 | 912 | have |= cap->issued; | 
|---|
| 890 | 913 | if ((have & mask) == mask) { | 
|---|
| 891 |  | -			dout("__ceph_caps_issued_mask %p combo issued %s" | 
|---|
| 892 |  | -			     " (mask %s)\n", &ci->vfs_inode, | 
|---|
|  | 914 | +			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" | 
|---|
|  | 915 | +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), | 
|---|
| 893 | 916 | ceph_cap_string(cap->issued), | 
|---|
| 894 | 917 | ceph_cap_string(mask)); | 
|---|
| 895 | 918 | if (touch) { | 
|---|
| .. | .. | 
|---|
| 903 | 926 | ci_node); | 
|---|
| 904 | 927 | if (!__cap_is_valid(cap)) | 
|---|
| 905 | 928 | continue; | 
|---|
| 906 |  | -					__touch_cap(cap); | 
|---|
|  | 929 | +					if (cap->issued & mask) | 
|---|
|  | 930 | +						__touch_cap(cap); | 
|---|
| 907 | 931 | } | 
|---|
| 908 | 932 | } | 
|---|
| 909 | 933 | return 1; | 
|---|
| .. | .. | 
|---|
| 911 | 935 | } | 
|---|
| 912 | 936 |  | 
|---|
| 913 | 937 | return 0; | 
|---|
|  | 938 | +} | 
|---|
|  | 939 | + | 
|---|
|  | 940 | +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, | 
|---|
|  | 941 | +				   int touch) | 
|---|
|  | 942 | +{ | 
|---|
|  | 943 | +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); | 
|---|
|  | 944 | +	int r; | 
|---|
|  | 945 | + | 
|---|
|  | 946 | +	r = __ceph_caps_issued_mask(ci, mask, touch); | 
|---|
|  | 947 | +	if (r) | 
|---|
|  | 948 | +		ceph_update_cap_hit(&fsc->mdsc->metric); | 
|---|
|  | 949 | +	else | 
|---|
|  | 950 | +		ceph_update_cap_mis(&fsc->mdsc->metric); | 
|---|
|  | 951 | +	return r; | 
|---|
| 914 | 952 | } | 
|---|
| 915 | 953 |  | 
|---|
| 916 | 954 | /* | 
|---|
| .. | .. | 
|---|
| 952 | 990 | if (ci->i_rd_ref) | 
|---|
| 953 | 991 | used |= CEPH_CAP_FILE_RD; | 
|---|
| 954 | 992 | if (ci->i_rdcache_ref || | 
|---|
| 955 |  | -	    (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ | 
|---|
|  | 993 | +	    (S_ISREG(ci->vfs_inode.i_mode) && | 
|---|
| 956 | 994 | ci->vfs_inode.i_data.nrpages)) | 
|---|
| 957 | 995 | used |= CEPH_CAP_FILE_CACHE; | 
|---|
| 958 | 996 | if (ci->i_wr_ref) | 
|---|
| 959 | 997 | used |= CEPH_CAP_FILE_WR; | 
|---|
| 960 | 998 | if (ci->i_wb_ref || ci->i_wrbuffer_ref) | 
|---|
| 961 | 999 | used |= CEPH_CAP_FILE_BUFFER; | 
|---|
|  | 1000 | +	if (ci->i_fx_ref) | 
|---|
|  | 1001 | +		used |= CEPH_CAP_FILE_EXCL; | 
|---|
| 962 | 1002 | return used; | 
|---|
| 963 | 1003 | } | 
|---|
|  | 1004 | + | 
|---|
|  | 1005 | +#define FMODE_WAIT_BIAS 1000 | 
|---|
| 964 | 1006 |  | 
|---|
| 965 | 1007 | /* | 
|---|
| 966 | 1008 | * wanted, by virtue of open file modes | 
|---|
| 967 | 1009 | */ | 
|---|
| 968 | 1010 | int __ceph_caps_file_wanted(struct ceph_inode_info *ci) | 
|---|
| 969 | 1011 | { | 
|---|
| 970 |  | -	int i, bits = 0; | 
|---|
| 971 |  | -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { | 
|---|
| 972 |  | -		if (ci->i_nr_by_mode[i]) | 
|---|
| 973 |  | -			bits |= 1 << i; | 
|---|
|  | 1012 | +	const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); | 
|---|
|  | 1013 | +	const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); | 
|---|
|  | 1014 | +	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); | 
|---|
|  | 1015 | +	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); | 
|---|
|  | 1016 | +	struct ceph_mount_options *opt = | 
|---|
|  | 1017 | +		ceph_inode_to_client(&ci->vfs_inode)->mount_options; | 
|---|
|  | 1018 | +	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; | 
|---|
|  | 1019 | +	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; | 
|---|
|  | 1020 | + | 
|---|
|  | 1021 | +	if (S_ISDIR(ci->vfs_inode.i_mode)) { | 
|---|
|  | 1022 | +		int want = 0; | 
|---|
|  | 1023 | + | 
|---|
|  | 1024 | +		/* use used_cutoff here, to keep dir's wanted caps longer */ | 
|---|
|  | 1025 | +		if (ci->i_nr_by_mode[RD_SHIFT] > 0 || | 
|---|
|  | 1026 | +		    time_after(ci->i_last_rd, used_cutoff)) | 
|---|
|  | 1027 | +			want |= CEPH_CAP_ANY_SHARED; | 
|---|
|  | 1028 | + | 
|---|
|  | 1029 | +		if (ci->i_nr_by_mode[WR_SHIFT] > 0 || | 
|---|
|  | 1030 | +		    time_after(ci->i_last_wr, used_cutoff)) { | 
|---|
|  | 1031 | +			want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; | 
|---|
|  | 1032 | +			if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) | 
|---|
|  | 1033 | +				want |= CEPH_CAP_ANY_DIR_OPS; | 
|---|
|  | 1034 | +		} | 
|---|
|  | 1035 | + | 
|---|
|  | 1036 | +		if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) | 
|---|
|  | 1037 | +			want |= CEPH_CAP_PIN; | 
|---|
|  | 1038 | + | 
|---|
|  | 1039 | +		return want; | 
|---|
|  | 1040 | +	} else { | 
|---|
|  | 1041 | +		int bits = 0; | 
|---|
|  | 1042 | + | 
|---|
|  | 1043 | +		if (ci->i_nr_by_mode[RD_SHIFT] > 0) { | 
|---|
|  | 1044 | +			if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || | 
|---|
|  | 1045 | +			    time_after(ci->i_last_rd, used_cutoff)) | 
|---|
|  | 1046 | +				bits |= 1 << RD_SHIFT; | 
|---|
|  | 1047 | +		} else if (time_after(ci->i_last_rd, idle_cutoff)) { | 
|---|
|  | 1048 | +			bits |= 1 << RD_SHIFT; | 
|---|
|  | 1049 | +		} | 
|---|
|  | 1050 | + | 
|---|
|  | 1051 | +		if (ci->i_nr_by_mode[WR_SHIFT] > 0) { | 
|---|
|  | 1052 | +			if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || | 
|---|
|  | 1053 | +			    time_after(ci->i_last_wr, used_cutoff)) | 
|---|
|  | 1054 | +				bits |= 1 << WR_SHIFT; | 
|---|
|  | 1055 | +		} else if (time_after(ci->i_last_wr, idle_cutoff)) { | 
|---|
|  | 1056 | +			bits |= 1 << WR_SHIFT; | 
|---|
|  | 1057 | +		} | 
|---|
|  | 1058 | + | 
|---|
|  | 1059 | +		/* check lazyio only when read/write is wanted */ | 
|---|
|  | 1060 | +		if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && | 
|---|
|  | 1061 | +		    ci->i_nr_by_mode[LAZY_SHIFT] > 0) | 
|---|
|  | 1062 | +			bits |= 1 << LAZY_SHIFT; | 
|---|
|  | 1063 | + | 
|---|
|  | 1064 | +		return bits ? ceph_caps_for_mode(bits >> 1) : 0; | 
|---|
| 974 | 1065 | } | 
|---|
| 975 |  | -	if (bits == 0) | 
|---|
| 976 |  | -		return 0; | 
|---|
| 977 |  | -	return ceph_caps_for_mode(bits >> 1); | 
|---|
|  | 1066 | +} | 
|---|
|  | 1067 | + | 
|---|
|  | 1068 | +/* | 
|---|
|  | 1069 | + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) | 
|---|
|  | 1070 | + */ | 
|---|
|  | 1071 | +int __ceph_caps_wanted(struct ceph_inode_info *ci) | 
|---|
|  | 1072 | +{ | 
|---|
|  | 1073 | +	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); | 
|---|
|  | 1074 | +	if (S_ISDIR(ci->vfs_inode.i_mode)) { | 
|---|
|  | 1075 | +		/* we want EXCL if holding caps of dir ops */ | 
|---|
|  | 1076 | +		if (w & CEPH_CAP_ANY_DIR_OPS) | 
|---|
|  | 1077 | +			w |= CEPH_CAP_FILE_EXCL; | 
|---|
|  | 1078 | +	} else { | 
|---|
|  | 1079 | +		/* we want EXCL if dirty data */ | 
|---|
|  | 1080 | +		if (w & CEPH_CAP_FILE_BUFFER) | 
|---|
|  | 1081 | +			w |= CEPH_CAP_FILE_EXCL; | 
|---|
|  | 1082 | +	} | 
|---|
|  | 1083 | +	return w; | 
|---|
| 978 | 1084 | } | 
|---|
| 979 | 1085 |  | 
|---|
| 980 | 1086 | /* | 
|---|
| .. | .. | 
|---|
| 998 | 1104 | return mds_wanted; | 
|---|
| 999 | 1105 | } | 
|---|
| 1000 | 1106 |  | 
|---|
| 1001 |  | -/* | 
|---|
| 1002 |  | - * called under i_ceph_lock | 
|---|
| 1003 |  | - */ | 
|---|
| 1004 |  | -static int __ceph_is_single_caps(struct ceph_inode_info *ci) | 
|---|
| 1005 |  | -{ | 
|---|
| 1006 |  | -	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); | 
|---|
| 1007 |  | -} | 
|---|
| 1008 |  | - | 
|---|
| 1009 |  | -static int __ceph_is_any_caps(struct ceph_inode_info *ci) | 
|---|
| 1010 |  | -{ | 
|---|
| 1011 |  | -	return !RB_EMPTY_ROOT(&ci->i_caps); | 
|---|
| 1012 |  | -} | 
|---|
| 1013 |  | - | 
|---|
| 1014 | 1107 | int ceph_is_any_caps(struct inode *inode) | 
|---|
| 1015 | 1108 | { | 
|---|
| 1016 | 1109 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 1017 | 1110 | int ret; | 
|---|
| 1018 | 1111 |  | 
|---|
| 1019 | 1112 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 1020 |  | -	ret = __ceph_is_any_caps(ci); | 
|---|
|  | 1113 | +	ret = __ceph_is_any_real_caps(ci); | 
|---|
| 1021 | 1114 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 1022 | 1115 |  | 
|---|
| 1023 | 1116 | return ret; | 
|---|
| .. | .. | 
|---|
| 1062 | 1155 |  | 
|---|
| 1063 | 1156 | /* remove from inode's cap rbtree, and clear auth cap */ | 
|---|
| 1064 | 1157 | rb_erase(&cap->ci_node, &ci->i_caps); | 
|---|
| 1065 |  | -	if (ci->i_auth_cap == cap) | 
|---|
|  | 1158 | +	if (ci->i_auth_cap == cap) { | 
|---|
|  | 1159 | +		WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); | 
|---|
| 1066 | 1160 | ci->i_auth_cap = NULL; | 
|---|
|  | 1161 | +	} | 
|---|
| 1067 | 1162 |  | 
|---|
| 1068 | 1163 | /* remove from session list */ | 
|---|
| 1069 | 1164 | spin_lock(&session->s_cap_lock); | 
|---|
| .. | .. | 
|---|
| 1074 | 1169 | } else { | 
|---|
| 1075 | 1170 | list_del_init(&cap->session_caps); | 
|---|
| 1076 | 1171 | session->s_nr_caps--; | 
|---|
|  | 1172 | +		atomic64_dec(&mdsc->metric.total_caps); | 
|---|
| 1077 | 1173 | cap->session = NULL; | 
|---|
| 1078 | 1174 | removed = 1; | 
|---|
| 1079 | 1175 | } | 
|---|
| .. | .. | 
|---|
| 1088 | 1184 | (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { | 
|---|
| 1089 | 1185 | cap->queue_release = 1; | 
|---|
| 1090 | 1186 | if (removed) { | 
|---|
| 1091 |  | -			list_add_tail(&cap->session_caps, | 
|---|
| 1092 |  | -				      &session->s_cap_releases); | 
|---|
| 1093 |  | -			session->s_num_cap_releases++; | 
|---|
|  | 1187 | +			__ceph_queue_cap_release(session, cap); | 
|---|
| 1094 | 1188 | removed = 0; | 
|---|
| 1095 | 1189 | } | 
|---|
| 1096 | 1190 | } else { | 
|---|
| .. | .. | 
|---|
| 1103 | 1197 | if (removed) | 
|---|
| 1104 | 1198 | ceph_put_cap(mdsc, cap); | 
|---|
| 1105 | 1199 |  | 
|---|
| 1106 |  | -	/* when reconnect denied, we remove session caps forcibly, | 
|---|
| 1107 |  | -	 * i_wr_ref can be non-zero. If there are ongoing write, | 
|---|
| 1108 |  | -	 * keep i_snap_realm. | 
|---|
| 1109 |  | -	 */ | 
|---|
| 1110 |  | -	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) | 
|---|
| 1111 |  | -		drop_inode_snap_realm(ci); | 
|---|
|  | 1200 | +	if (!__ceph_is_any_real_caps(ci)) { | 
|---|
|  | 1201 | +		/* when reconnect denied, we remove session caps forcibly, | 
|---|
|  | 1202 | +		 * i_wr_ref can be non-zero. If there are ongoing write, | 
|---|
|  | 1203 | +		 * keep i_snap_realm. | 
|---|
|  | 1204 | +		 */ | 
|---|
|  | 1205 | +		if (ci->i_wr_ref == 0 && ci->i_snap_realm) | 
|---|
|  | 1206 | +			drop_inode_snap_realm(ci); | 
|---|
| 1112 | 1207 |  | 
|---|
| 1113 |  | -	if (!__ceph_is_any_real_caps(ci)) | 
|---|
| 1114 | 1208 | __cap_delay_cancel(mdsc, ci); | 
|---|
|  | 1209 | +	} | 
|---|
| 1115 | 1210 | } | 
|---|
| 1116 | 1211 |  | 
|---|
| 1117 | 1212 | struct cap_msg_args { | 
|---|
| .. | .. | 
|---|
| 1119 | 1214 | u64			ino, cid, follows; | 
|---|
| 1120 | 1215 | u64			flush_tid, oldest_flush_tid, size, max_size; | 
|---|
| 1121 | 1216 | u64			xattr_version; | 
|---|
|  | 1217 | +	u64			change_attr; | 
|---|
| 1122 | 1218 | struct ceph_buffer	*xattr_buf; | 
|---|
| 1123 |  | -	struct timespec64	atime, mtime, ctime; | 
|---|
|  | 1219 | +	struct ceph_buffer	*old_xattr_buf; | 
|---|
|  | 1220 | +	struct timespec64	atime, mtime, ctime, btime; | 
|---|
| 1124 | 1221 | int			op, caps, wanted, dirty; | 
|---|
| 1125 | 1222 | u32			seq, issue_seq, mseq, time_warp_seq; | 
|---|
| 1126 | 1223 | u32			flags; | 
|---|
| .. | .. | 
|---|
| 1128 | 1225 | kgid_t			gid; | 
|---|
| 1129 | 1226 | umode_t			mode; | 
|---|
| 1130 | 1227 | bool			inline_data; | 
|---|
|  | 1228 | +	bool			wake; | 
|---|
| 1131 | 1229 | }; | 
|---|
| 1132 | 1230 |  | 
|---|
| 1133 | 1231 | /* | 
|---|
| 1134 |  | - * Build and send a cap message to the given MDS. | 
|---|
| 1135 |  | - * | 
|---|
| 1136 |  | - * Caller should be holding s_mutex. | 
|---|
|  | 1232 | + * cap struct size + flock buffer size + inline version + inline data size + | 
|---|
|  | 1233 | + * osd_epoch_barrier + oldest_flush_tid | 
|---|
| 1137 | 1234 | */ | 
|---|
| 1138 |  | -static int send_cap_msg(struct cap_msg_args *arg) | 
|---|
|  | 1235 | +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ | 
|---|
|  | 1236 | +		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) | 
|---|
|  | 1237 | + | 
|---|
|  | 1238 | +/* Marshal up the cap msg to the MDS */ | 
|---|
|  | 1239 | +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) | 
|---|
| 1139 | 1240 | { | 
|---|
| 1140 | 1241 | struct ceph_mds_caps *fc; | 
|---|
| 1141 |  | -	struct ceph_msg *msg; | 
|---|
| 1142 | 1242 | void *p; | 
|---|
| 1143 |  | -	size_t extra_len; | 
|---|
| 1144 |  | -	struct timespec64 zerotime = {0}; | 
|---|
| 1145 | 1243 | struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; | 
|---|
| 1146 | 1244 |  | 
|---|
| 1147 |  | -	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" | 
|---|
| 1148 |  | -	     " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" | 
|---|
| 1149 |  | -	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), | 
|---|
| 1150 |  | -	     arg->cid, arg->ino, ceph_cap_string(arg->caps), | 
|---|
| 1151 |  | -	     ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), | 
|---|
| 1152 |  | -	     arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, | 
|---|
| 1153 |  | -	     arg->mseq, arg->follows, arg->size, arg->max_size, | 
|---|
| 1154 |  | -	     arg->xattr_version, | 
|---|
|  | 1245 | +	dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", | 
|---|
|  | 1246 | +	     __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, | 
|---|
|  | 1247 | +	     ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), | 
|---|
|  | 1248 | +	     ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, | 
|---|
|  | 1249 | +	     arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, | 
|---|
|  | 1250 | +	     arg->size, arg->max_size, arg->xattr_version, | 
|---|
| 1155 | 1251 | arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); | 
|---|
| 1156 |  | - | 
|---|
| 1157 |  | -	/* flock buffer size + inline version + inline data size + | 
|---|
| 1158 |  | -	 * osd_epoch_barrier + oldest_flush_tid */ | 
|---|
| 1159 |  | -	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; | 
|---|
| 1160 |  | -	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, | 
|---|
| 1161 |  | -			   GFP_NOFS, false); | 
|---|
| 1162 |  | -	if (!msg) | 
|---|
| 1163 |  | -		return -ENOMEM; | 
|---|
| 1164 | 1252 |  | 
|---|
| 1165 | 1253 | msg->hdr.version = cpu_to_le16(10); | 
|---|
| 1166 | 1254 | msg->hdr.tid = cpu_to_le64(arg->flush_tid); | 
|---|
| .. | .. | 
|---|
| 1226 | 1314 | /* pool namespace (version 8) (mds always ignores this) */ | 
|---|
| 1227 | 1315 | ceph_encode_32(&p, 0); | 
|---|
| 1228 | 1316 |  | 
|---|
| 1229 |  | -	/* | 
|---|
| 1230 |  | -	 * btime and change_attr (version 9) | 
|---|
| 1231 |  | -	 * | 
|---|
| 1232 |  | -	 * We just zero these out for now, as the MDS ignores them unless | 
|---|
| 1233 |  | -	 * the requisite feature flags are set (which we don't do yet). | 
|---|
| 1234 |  | -	 */ | 
|---|
| 1235 |  | -	ceph_encode_timespec64(p, &zerotime); | 
|---|
|  | 1317 | +	/* btime and change_attr (version 9) */ | 
|---|
|  | 1318 | +	ceph_encode_timespec64(p, &arg->btime); | 
|---|
| 1236 | 1319 | p += sizeof(struct ceph_timespec); | 
|---|
| 1237 |  | -	ceph_encode_64(&p, 0); | 
|---|
|  | 1320 | +	ceph_encode_64(&p, arg->change_attr); | 
|---|
| 1238 | 1321 |  | 
|---|
| 1239 | 1322 | /* Advisory flags (version 10) */ | 
|---|
| 1240 | 1323 | ceph_encode_32(&p, arg->flags); | 
|---|
| 1241 |  | - | 
|---|
| 1242 |  | -	ceph_con_send(&arg->session->s_con, msg); | 
|---|
| 1243 |  | -	return 0; | 
|---|
| 1244 | 1324 | } | 
|---|
| 1245 | 1325 |  | 
|---|
| 1246 | 1326 | /* | 
|---|
| 1247 | 1327 | * Queue cap releases when an inode is dropped from our cache. | 
|---|
| 1248 | 1328 | */ | 
|---|
| 1249 |  | -void ceph_queue_caps_release(struct inode *inode) | 
|---|
|  | 1329 | +void __ceph_remove_caps(struct ceph_inode_info *ci) | 
|---|
| 1250 | 1330 | { | 
|---|
| 1251 |  | -	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 1252 | 1331 | struct rb_node *p; | 
|---|
| 1253 | 1332 |  | 
|---|
| 1254 | 1333 | /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) | 
|---|
| .. | .. | 
|---|
| 1264 | 1343 | } | 
|---|
| 1265 | 1344 |  | 
|---|
| 1266 | 1345 | /* | 
|---|
| 1267 |  | - * Send a cap msg on the given inode.  Update our caps state, then | 
|---|
| 1268 |  | - * drop i_ceph_lock and send the message. | 
|---|
|  | 1346 | + * Prepare to send a cap message to an MDS. Update the cap state, and populate | 
|---|
|  | 1347 | + * the arg struct with the parameters that will need to be sent. This should | 
|---|
|  | 1348 | + * be done under the i_ceph_lock to guard against changes to cap state. | 
|---|
| 1269 | 1349 | * | 
|---|
| 1270 | 1350 | * Make note of max_size reported/requested from mds, revoked caps | 
|---|
| 1271 | 1351 | * that have now been implemented. | 
|---|
| 1272 |  | - * | 
|---|
| 1273 |  | - * Make half-hearted attempt ot to invalidate page cache if we are | 
|---|
| 1274 |  | - * dropping RDCACHE.  Note that this will leave behind locked pages | 
|---|
| 1275 |  | - * that we'll then need to deal with elsewhere. | 
|---|
| 1276 |  | - * | 
|---|
| 1277 |  | - * Return non-zero if delayed release, or we experienced an error | 
|---|
| 1278 |  | - * such that the caller should requeue + retry later. | 
|---|
| 1279 |  | - * | 
|---|
| 1280 |  | - * called with i_ceph_lock, then drops it. | 
|---|
| 1281 |  | - * caller should hold snap_rwsem (read), s_mutex. | 
|---|
| 1282 | 1352 | */ | 
|---|
| 1283 |  | -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | 
|---|
| 1284 |  | -		      int op, bool sync, int used, int want, int retain, | 
|---|
| 1285 |  | -		      int flushing, u64 flush_tid, u64 oldest_flush_tid) | 
|---|
| 1286 |  | -	__releases(cap->ci->i_ceph_lock) | 
|---|
|  | 1353 | +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, | 
|---|
|  | 1354 | +		       int op, int flags, int used, int want, int retain, | 
|---|
|  | 1355 | +		       int flushing, u64 flush_tid, u64 oldest_flush_tid) | 
|---|
| 1287 | 1356 | { | 
|---|
| 1288 | 1357 | struct ceph_inode_info *ci = cap->ci; | 
|---|
| 1289 | 1358 | struct inode *inode = &ci->vfs_inode; | 
|---|
| 1290 |  | -	struct ceph_buffer *old_blob = NULL; | 
|---|
| 1291 |  | -	struct cap_msg_args arg; | 
|---|
| 1292 | 1359 | int held, revoking; | 
|---|
| 1293 |  | -	int wake = 0; | 
|---|
| 1294 |  | -	int delayed = 0; | 
|---|
| 1295 |  | -	int ret; | 
|---|
|  | 1360 | + | 
|---|
|  | 1361 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
| 1296 | 1362 |  | 
|---|
| 1297 | 1363 | held = cap->issued | cap->implemented; | 
|---|
| 1298 | 1364 | revoking = cap->implemented & ~cap->issued; | 
|---|
| 1299 | 1365 | retain &= ~revoking; | 
|---|
| 1300 | 1366 |  | 
|---|
| 1301 |  | -	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", | 
|---|
| 1302 |  | -	     inode, cap, cap->session, | 
|---|
|  | 1367 | +	dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", | 
|---|
|  | 1368 | +	     __func__, inode, cap, cap->session, | 
|---|
| 1303 | 1369 | ceph_cap_string(held), ceph_cap_string(held & retain), | 
|---|
| 1304 | 1370 | ceph_cap_string(revoking)); | 
|---|
| 1305 | 1371 | BUG_ON((retain & CEPH_CAP_PIN) == 0); | 
|---|
| 1306 | 1372 |  | 
|---|
| 1307 |  | -	arg.session = cap->session; | 
|---|
| 1308 |  | - | 
|---|
| 1309 |  | -	/* don't release wanted unless we've waited a bit. */ | 
|---|
| 1310 |  | -	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && | 
|---|
| 1311 |  | -	    time_before(jiffies, ci->i_hold_caps_min)) { | 
|---|
| 1312 |  | -		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", | 
|---|
| 1313 |  | -		     ceph_cap_string(cap->issued), | 
|---|
| 1314 |  | -		     ceph_cap_string(cap->issued & retain), | 
|---|
| 1315 |  | -		     ceph_cap_string(cap->mds_wanted), | 
|---|
| 1316 |  | -		     ceph_cap_string(want)); | 
|---|
| 1317 |  | -		want |= cap->mds_wanted; | 
|---|
| 1318 |  | -		retain |= cap->issued; | 
|---|
| 1319 |  | -		delayed = 1; | 
|---|
| 1320 |  | -	} | 
|---|
| 1321 |  | -	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); | 
|---|
| 1322 |  | -	if (want & ~cap->mds_wanted) { | 
|---|
| 1323 |  | -		/* user space may open/close single file frequently. | 
|---|
| 1324 |  | -		 * This avoids droping mds_wanted immediately after | 
|---|
| 1325 |  | -		 * requesting new mds_wanted. | 
|---|
| 1326 |  | -		 */ | 
|---|
| 1327 |  | -		__cap_set_timeouts(mdsc, ci); | 
|---|
| 1328 |  | -	} | 
|---|
|  | 1373 | +	ci->i_ceph_flags &= ~CEPH_I_FLUSH; | 
|---|
| 1329 | 1374 |  | 
|---|
| 1330 | 1375 | cap->issued &= retain;  /* drop bits we don't want */ | 
|---|
| 1331 |  | -	if (cap->implemented & ~cap->issued) { | 
|---|
| 1332 |  | -		/* | 
|---|
| 1333 |  | -		 * Wake up any waiters on wanted -> needed transition. | 
|---|
| 1334 |  | -		 * This is due to the weird transition from buffered | 
|---|
| 1335 |  | -		 * to sync IO... we need to flush dirty pages _before_ | 
|---|
| 1336 |  | -		 * allowing sync writes to avoid reordering. | 
|---|
| 1337 |  | -		 */ | 
|---|
| 1338 |  | -		wake = 1; | 
|---|
| 1339 |  | -	} | 
|---|
|  | 1376 | +	/* | 
|---|
|  | 1377 | +	 * Wake up any waiters on wanted -> needed transition. This is due to | 
|---|
|  | 1378 | +	 * the weird transition from buffered to sync IO... we need to flush | 
|---|
|  | 1379 | +	 * dirty pages _before_ allowing sync writes to avoid reordering. | 
|---|
|  | 1380 | +	 */ | 
|---|
|  | 1381 | +	arg->wake = cap->implemented & ~cap->issued; | 
|---|
| 1340 | 1382 | cap->implemented &= cap->issued | used; | 
|---|
| 1341 | 1383 | cap->mds_wanted = want; | 
|---|
| 1342 | 1384 |  | 
|---|
| 1343 |  | -	arg.ino = ceph_vino(inode).ino; | 
|---|
| 1344 |  | -	arg.cid = cap->cap_id; | 
|---|
| 1345 |  | -	arg.follows = flushing ? ci->i_head_snapc->seq : 0; | 
|---|
| 1346 |  | -	arg.flush_tid = flush_tid; | 
|---|
| 1347 |  | -	arg.oldest_flush_tid = oldest_flush_tid; | 
|---|
|  | 1385 | +	arg->session = cap->session; | 
|---|
|  | 1386 | +	arg->ino = ceph_vino(inode).ino; | 
|---|
|  | 1387 | +	arg->cid = cap->cap_id; | 
|---|
|  | 1388 | +	arg->follows = flushing ? ci->i_head_snapc->seq : 0; | 
|---|
|  | 1389 | +	arg->flush_tid = flush_tid; | 
|---|
|  | 1390 | +	arg->oldest_flush_tid = oldest_flush_tid; | 
|---|
| 1348 | 1391 |  | 
|---|
| 1349 |  | -	arg.size = inode->i_size; | 
|---|
| 1350 |  | -	ci->i_reported_size = arg.size; | 
|---|
| 1351 |  | -	arg.max_size = ci->i_wanted_max_size; | 
|---|
| 1352 |  | -	ci->i_requested_max_size = arg.max_size; | 
|---|
|  | 1392 | +	arg->size = inode->i_size; | 
|---|
|  | 1393 | +	ci->i_reported_size = arg->size; | 
|---|
|  | 1394 | +	arg->max_size = ci->i_wanted_max_size; | 
|---|
|  | 1395 | +	if (cap == ci->i_auth_cap) { | 
|---|
|  | 1396 | +		if (want & CEPH_CAP_ANY_FILE_WR) | 
|---|
|  | 1397 | +			ci->i_requested_max_size = arg->max_size; | 
|---|
|  | 1398 | +		else | 
|---|
|  | 1399 | +			ci->i_requested_max_size = 0; | 
|---|
|  | 1400 | +	} | 
|---|
| 1353 | 1401 |  | 
|---|
| 1354 | 1402 | if (flushing & CEPH_CAP_XATTR_EXCL) { | 
|---|
| 1355 |  | -		old_blob = __ceph_build_xattrs_blob(ci); | 
|---|
| 1356 |  | -		arg.xattr_version = ci->i_xattrs.version; | 
|---|
| 1357 |  | -		arg.xattr_buf = ci->i_xattrs.blob; | 
|---|
|  | 1403 | +		arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); | 
|---|
|  | 1404 | +		arg->xattr_version = ci->i_xattrs.version; | 
|---|
|  | 1405 | +		arg->xattr_buf = ci->i_xattrs.blob; | 
|---|
| 1358 | 1406 | } else { | 
|---|
| 1359 |  | -		arg.xattr_buf = NULL; | 
|---|
|  | 1407 | +		arg->xattr_buf = NULL; | 
|---|
|  | 1408 | +		arg->old_xattr_buf = NULL; | 
|---|
| 1360 | 1409 | } | 
|---|
| 1361 | 1410 |  | 
|---|
| 1362 |  | -	arg.mtime = inode->i_mtime; | 
|---|
| 1363 |  | -	arg.atime = inode->i_atime; | 
|---|
| 1364 |  | -	arg.ctime = inode->i_ctime; | 
|---|
|  | 1411 | +	arg->mtime = inode->i_mtime; | 
|---|
|  | 1412 | +	arg->atime = inode->i_atime; | 
|---|
|  | 1413 | +	arg->ctime = inode->i_ctime; | 
|---|
|  | 1414 | +	arg->btime = ci->i_btime; | 
|---|
|  | 1415 | +	arg->change_attr = inode_peek_iversion_raw(inode); | 
|---|
| 1365 | 1416 |  | 
|---|
| 1366 |  | -	arg.op = op; | 
|---|
| 1367 |  | -	arg.caps = cap->implemented; | 
|---|
| 1368 |  | -	arg.wanted = want; | 
|---|
| 1369 |  | -	arg.dirty = flushing; | 
|---|
|  | 1417 | +	arg->op = op; | 
|---|
|  | 1418 | +	arg->caps = cap->implemented; | 
|---|
|  | 1419 | +	arg->wanted = want; | 
|---|
|  | 1420 | +	arg->dirty = flushing; | 
|---|
| 1370 | 1421 |  | 
|---|
| 1371 |  | -	arg.seq = cap->seq; | 
|---|
| 1372 |  | -	arg.issue_seq = cap->issue_seq; | 
|---|
| 1373 |  | -	arg.mseq = cap->mseq; | 
|---|
| 1374 |  | -	arg.time_warp_seq = ci->i_time_warp_seq; | 
|---|
|  | 1422 | +	arg->seq = cap->seq; | 
|---|
|  | 1423 | +	arg->issue_seq = cap->issue_seq; | 
|---|
|  | 1424 | +	arg->mseq = cap->mseq; | 
|---|
|  | 1425 | +	arg->time_warp_seq = ci->i_time_warp_seq; | 
|---|
| 1375 | 1426 |  | 
|---|
| 1376 |  | -	arg.uid = inode->i_uid; | 
|---|
| 1377 |  | -	arg.gid = inode->i_gid; | 
|---|
| 1378 |  | -	arg.mode = inode->i_mode; | 
|---|
|  | 1427 | +	arg->uid = inode->i_uid; | 
|---|
|  | 1428 | +	arg->gid = inode->i_gid; | 
|---|
|  | 1429 | +	arg->mode = inode->i_mode; | 
|---|
| 1379 | 1430 |  | 
|---|
| 1380 |  | -	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; | 
|---|
| 1381 |  | -	if (list_empty(&ci->i_cap_snaps)) | 
|---|
| 1382 |  | -		arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; | 
|---|
| 1383 |  | -	else | 
|---|
| 1384 |  | -		arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; | 
|---|
| 1385 |  | -	if (sync) | 
|---|
| 1386 |  | -		arg.flags |= CEPH_CLIENT_CAPS_SYNC; | 
|---|
|  | 1431 | +	arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; | 
|---|
|  | 1432 | +	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && | 
|---|
|  | 1433 | +	    !list_empty(&ci->i_cap_snaps)) { | 
|---|
|  | 1434 | +		struct ceph_cap_snap *capsnap; | 
|---|
|  | 1435 | +		list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { | 
|---|
|  | 1436 | +			if (capsnap->cap_flush.tid) | 
|---|
|  | 1437 | +				break; | 
|---|
|  | 1438 | +			if (capsnap->need_flush) { | 
|---|
|  | 1439 | +				flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; | 
|---|
|  | 1440 | +				break; | 
|---|
|  | 1441 | +			} | 
|---|
|  | 1442 | +		} | 
|---|
|  | 1443 | +	} | 
|---|
|  | 1444 | +	arg->flags = flags; | 
|---|
|  | 1445 | +} | 
|---|
| 1387 | 1446 |  | 
|---|
| 1388 |  | -	spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 1447 | +/* | 
|---|
|  | 1448 | + * Send a cap msg on the given inode. | 
|---|
|  | 1449 | + * | 
|---|
|  | 1450 | + * Caller should hold snap_rwsem (read), s_mutex. | 
|---|
|  | 1451 | + */ | 
|---|
|  | 1452 | +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) | 
|---|
|  | 1453 | +{ | 
|---|
|  | 1454 | +	struct ceph_msg *msg; | 
|---|
|  | 1455 | +	struct inode *inode = &ci->vfs_inode; | 
|---|
| 1389 | 1456 |  | 
|---|
| 1390 |  | -	ceph_buffer_put(old_blob); | 
|---|
| 1391 |  | - | 
|---|
| 1392 |  | -	ret = send_cap_msg(&arg); | 
|---|
| 1393 |  | -	if (ret < 0) { | 
|---|
| 1394 |  | -		dout("error sending cap msg, must requeue %p\n", inode); | 
|---|
| 1395 |  | -		delayed = 1; | 
|---|
|  | 1457 | +	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); | 
|---|
|  | 1458 | +	if (!msg) { | 
|---|
|  | 1459 | +		pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", | 
|---|
|  | 1460 | +		       ceph_vinop(inode), ceph_cap_string(arg->dirty), | 
|---|
|  | 1461 | +		       arg->flush_tid); | 
|---|
|  | 1462 | +		spin_lock(&ci->i_ceph_lock); | 
|---|
|  | 1463 | +		__cap_delay_requeue(arg->session->s_mdsc, ci); | 
|---|
|  | 1464 | +		spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 1465 | +		return; | 
|---|
| 1396 | 1466 | } | 
|---|
| 1397 | 1467 |  | 
|---|
| 1398 |  | -	if (wake) | 
|---|
|  | 1468 | +	encode_cap_msg(msg, arg); | 
|---|
|  | 1469 | +	ceph_con_send(&arg->session->s_con, msg); | 
|---|
|  | 1470 | +	ceph_buffer_put(arg->old_xattr_buf); | 
|---|
|  | 1471 | +	if (arg->wake) | 
|---|
| 1399 | 1472 | wake_up_all(&ci->i_cap_wq); | 
|---|
| 1400 |  | - | 
|---|
| 1401 |  | -	return delayed; | 
|---|
| 1402 | 1473 | } | 
|---|
| 1403 | 1474 |  | 
|---|
| 1404 | 1475 | static inline int __send_flush_snap(struct inode *inode, | 
|---|
| .. | .. | 
|---|
| 1407 | 1478 | u32 mseq, u64 oldest_flush_tid) | 
|---|
| 1408 | 1479 | { | 
|---|
| 1409 | 1480 | struct cap_msg_args	arg; | 
|---|
|  | 1481 | +	struct ceph_msg		*msg; | 
|---|
|  | 1482 | + | 
|---|
|  | 1483 | +	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); | 
|---|
|  | 1484 | +	if (!msg) | 
|---|
|  | 1485 | +		return -ENOMEM; | 
|---|
| 1410 | 1486 |  | 
|---|
| 1411 | 1487 | arg.session = session; | 
|---|
| 1412 | 1488 | arg.ino = ceph_vino(inode).ino; | 
|---|
| .. | .. | 
|---|
| 1419 | 1495 | arg.max_size = 0; | 
|---|
| 1420 | 1496 | arg.xattr_version = capsnap->xattr_version; | 
|---|
| 1421 | 1497 | arg.xattr_buf = capsnap->xattr_blob; | 
|---|
|  | 1498 | +	arg.old_xattr_buf = NULL; | 
|---|
| 1422 | 1499 |  | 
|---|
| 1423 | 1500 | arg.atime = capsnap->atime; | 
|---|
| 1424 | 1501 | arg.mtime = capsnap->mtime; | 
|---|
| 1425 | 1502 | arg.ctime = capsnap->ctime; | 
|---|
|  | 1503 | +	arg.btime = capsnap->btime; | 
|---|
|  | 1504 | +	arg.change_attr = capsnap->change_attr; | 
|---|
| 1426 | 1505 |  | 
|---|
| 1427 | 1506 | arg.op = CEPH_CAP_OP_FLUSHSNAP; | 
|---|
| 1428 | 1507 | arg.caps = capsnap->issued; | 
|---|
| .. | .. | 
|---|
| 1440 | 1519 |  | 
|---|
| 1441 | 1520 | arg.inline_data = capsnap->inline_data; | 
|---|
| 1442 | 1521 | arg.flags = 0; | 
|---|
|  | 1522 | +	arg.wake = false; | 
|---|
| 1443 | 1523 |  | 
|---|
| 1444 |  | -	return send_cap_msg(&arg); | 
|---|
|  | 1524 | +	encode_cap_msg(msg, &arg); | 
|---|
|  | 1525 | +	ceph_con_send(&arg.session->s_con, msg); | 
|---|
|  | 1526 | +	return 0; | 
|---|
| 1445 | 1527 | } | 
|---|
| 1446 | 1528 |  | 
|---|
| 1447 | 1529 | /* | 
|---|
| .. | .. | 
|---|
| 1590 | 1672 | } | 
|---|
| 1591 | 1673 |  | 
|---|
| 1592 | 1674 | // make sure flushsnap messages are sent in proper order. | 
|---|
| 1593 |  | -	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { | 
|---|
|  | 1675 | +	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) | 
|---|
| 1594 | 1676 | __kick_flushing_caps(mdsc, session, ci, 0); | 
|---|
| 1595 |  | -		ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
| 1596 |  | -	} | 
|---|
| 1597 | 1677 |  | 
|---|
| 1598 | 1678 | __ceph_flush_snaps(ci, session); | 
|---|
| 1599 | 1679 | out: | 
|---|
| .. | .. | 
|---|
| 1625 | 1705 | int was = ci->i_dirty_caps; | 
|---|
| 1626 | 1706 | int dirty = 0; | 
|---|
| 1627 | 1707 |  | 
|---|
|  | 1708 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 1709 | + | 
|---|
| 1628 | 1710 | if (!ci->i_auth_cap) { | 
|---|
| 1629 | 1711 | pr_warn("__mark_dirty_caps %p %llx mask %s, " | 
|---|
| 1630 | 1712 | "but no auth cap (session was closed?)\n", | 
|---|
| .. | .. | 
|---|
| 1637 | 1719 | ceph_cap_string(was | mask)); | 
|---|
| 1638 | 1720 | ci->i_dirty_caps |= mask; | 
|---|
| 1639 | 1721 | if (was == 0) { | 
|---|
|  | 1722 | +		struct ceph_mds_session *session = ci->i_auth_cap->session; | 
|---|
|  | 1723 | + | 
|---|
| 1640 | 1724 | WARN_ON_ONCE(ci->i_prealloc_cap_flush); | 
|---|
| 1641 | 1725 | swap(ci->i_prealloc_cap_flush, *pcf); | 
|---|
| 1642 | 1726 |  | 
|---|
| .. | .. | 
|---|
| 1649 | 1733 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); | 
|---|
| 1650 | 1734 | BUG_ON(!list_empty(&ci->i_dirty_item)); | 
|---|
| 1651 | 1735 | spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 1652 |  | -		list_add(&ci->i_dirty_item, &mdsc->cap_dirty); | 
|---|
|  | 1736 | +		list_add(&ci->i_dirty_item, &session->s_cap_dirty); | 
|---|
| 1653 | 1737 | spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
| 1654 | 1738 | if (ci->i_flushing_caps == 0) { | 
|---|
| 1655 | 1739 | ihold(inode); | 
|---|
| .. | .. | 
|---|
| 1668 | 1752 |  | 
|---|
| 1669 | 1753 | struct ceph_cap_flush *ceph_alloc_cap_flush(void) | 
|---|
| 1670 | 1754 | { | 
|---|
| 1671 |  | -	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); | 
|---|
|  | 1755 | +	struct ceph_cap_flush *cf; | 
|---|
|  | 1756 | + | 
|---|
|  | 1757 | +	cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); | 
|---|
|  | 1758 | +	if (!cf) | 
|---|
|  | 1759 | +		return NULL; | 
|---|
|  | 1760 | + | 
|---|
|  | 1761 | +	cf->is_capsnap = false; | 
|---|
|  | 1762 | +	return cf; | 
|---|
| 1672 | 1763 | } | 
|---|
| 1673 | 1764 |  | 
|---|
| 1674 | 1765 | void ceph_free_cap_flush(struct ceph_cap_flush *cf) | 
|---|
| .. | .. | 
|---|
| 1692 | 1783 | * Remove cap_flush from the mdsc's or inode's flushing cap list. | 
|---|
| 1693 | 1784 | * Return true if caller needs to wake up flush waiters. | 
|---|
| 1694 | 1785 | */ | 
|---|
| 1695 |  | -static bool __finish_cap_flush(struct ceph_mds_client *mdsc, | 
|---|
| 1696 |  | -			       struct ceph_inode_info *ci, | 
|---|
| 1697 |  | -			       struct ceph_cap_flush *cf) | 
|---|
|  | 1786 | +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, | 
|---|
|  | 1787 | +					 struct ceph_cap_flush *cf) | 
|---|
| 1698 | 1788 | { | 
|---|
| 1699 | 1789 | struct ceph_cap_flush *prev; | 
|---|
| 1700 | 1790 | bool wake = cf->wake; | 
|---|
| 1701 |  | -	if (mdsc) { | 
|---|
| 1702 |  | -		/* are there older pending cap flushes? */ | 
|---|
| 1703 |  | -		if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { | 
|---|
| 1704 |  | -			prev = list_prev_entry(cf, g_list); | 
|---|
| 1705 |  | -			prev->wake = true; | 
|---|
| 1706 |  | -			wake = false; | 
|---|
| 1707 |  | -		} | 
|---|
| 1708 |  | -		list_del(&cf->g_list); | 
|---|
| 1709 |  | -	} else if (ci) { | 
|---|
| 1710 |  | -		if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { | 
|---|
| 1711 |  | -			prev = list_prev_entry(cf, i_list); | 
|---|
| 1712 |  | -			prev->wake = true; | 
|---|
| 1713 |  | -			wake = false; | 
|---|
| 1714 |  | -		} | 
|---|
| 1715 |  | -		list_del(&cf->i_list); | 
|---|
| 1716 |  | -	} else { | 
|---|
| 1717 |  | -		BUG_ON(1); | 
|---|
|  | 1791 | + | 
|---|
|  | 1792 | +	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { | 
|---|
|  | 1793 | +		prev = list_prev_entry(cf, g_list); | 
|---|
|  | 1794 | +		prev->wake = true; | 
|---|
|  | 1795 | +		wake = false; | 
|---|
| 1718 | 1796 | } | 
|---|
|  | 1797 | +	list_del_init(&cf->g_list); | 
|---|
|  | 1798 | +	return wake; | 
|---|
|  | 1799 | +} | 
|---|
|  | 1800 | + | 
|---|
|  | 1801 | +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, | 
|---|
|  | 1802 | +				       struct ceph_cap_flush *cf) | 
|---|
|  | 1803 | +{ | 
|---|
|  | 1804 | +	struct ceph_cap_flush *prev; | 
|---|
|  | 1805 | +	bool wake = cf->wake; | 
|---|
|  | 1806 | + | 
|---|
|  | 1807 | +	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { | 
|---|
|  | 1808 | +		prev = list_prev_entry(cf, i_list); | 
|---|
|  | 1809 | +		prev->wake = true; | 
|---|
|  | 1810 | +		wake = false; | 
|---|
|  | 1811 | +	} | 
|---|
|  | 1812 | +	list_del_init(&cf->i_list); | 
|---|
| 1719 | 1813 | return wake; | 
|---|
| 1720 | 1814 | } | 
|---|
| 1721 | 1815 |  | 
|---|
| .. | .. | 
|---|
| 1723 | 1817 | * Add dirty inode to the flushing list.  Assigned a seq number so we | 
|---|
| 1724 | 1818 | * can wait for caps to flush without starving. | 
|---|
| 1725 | 1819 | * | 
|---|
| 1726 |  | - * Called under i_ceph_lock. | 
|---|
|  | 1820 | + * Called under i_ceph_lock. Returns the flush tid. | 
|---|
| 1727 | 1821 | */ | 
|---|
| 1728 |  | -static int __mark_caps_flushing(struct inode *inode, | 
|---|
|  | 1822 | +static u64 __mark_caps_flushing(struct inode *inode, | 
|---|
| 1729 | 1823 | struct ceph_mds_session *session, bool wake, | 
|---|
| 1730 |  | -				u64 *flush_tid, u64 *oldest_flush_tid) | 
|---|
|  | 1824 | +				u64 *oldest_flush_tid) | 
|---|
| 1731 | 1825 | { | 
|---|
| 1732 | 1826 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 
|---|
| 1733 | 1827 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 1734 | 1828 | struct ceph_cap_flush *cf = NULL; | 
|---|
| 1735 | 1829 | int flushing; | 
|---|
| 1736 | 1830 |  | 
|---|
|  | 1831 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
| 1737 | 1832 | BUG_ON(ci->i_dirty_caps == 0); | 
|---|
| 1738 | 1833 | BUG_ON(list_empty(&ci->i_dirty_item)); | 
|---|
| 1739 | 1834 | BUG_ON(!ci->i_prealloc_cap_flush); | 
|---|
| .. | .. | 
|---|
| 1766 | 1861 |  | 
|---|
| 1767 | 1862 | list_add_tail(&cf->i_list, &ci->i_cap_flush_list); | 
|---|
| 1768 | 1863 |  | 
|---|
| 1769 |  | -	*flush_tid = cf->tid; | 
|---|
| 1770 |  | -	return flushing; | 
|---|
|  | 1864 | +	return cf->tid; | 
|---|
| 1771 | 1865 | } | 
|---|
| 1772 | 1866 |  | 
|---|
| 1773 | 1867 | /* | 
|---|
| .. | .. | 
|---|
| 1817 | 1911 | * versus held caps.  Release, flush, ack revoked caps to mds as | 
|---|
| 1818 | 1912 | * appropriate. | 
|---|
| 1819 | 1913 | * | 
|---|
| 1820 |  | - *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay | 
|---|
| 1821 |  | - *    cap release further. | 
|---|
| 1822 | 1914 | *  CHECK_CAPS_AUTHONLY - we should only check the auth cap | 
|---|
| 1823 | 1915 | *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without | 
|---|
| 1824 | 1916 | *    further delay. | 
|---|
| .. | .. | 
|---|
| 1826 | 1918 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, | 
|---|
| 1827 | 1919 | struct ceph_mds_session *session) | 
|---|
| 1828 | 1920 | { | 
|---|
| 1829 |  | -	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); | 
|---|
| 1830 |  | -	struct ceph_mds_client *mdsc = fsc->mdsc; | 
|---|
| 1831 | 1921 | struct inode *inode = &ci->vfs_inode; | 
|---|
|  | 1922 | +	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); | 
|---|
| 1832 | 1923 | struct ceph_cap *cap; | 
|---|
| 1833 | 1924 | u64 flush_tid, oldest_flush_tid; | 
|---|
| 1834 | 1925 | int file_wanted, used, cap_used; | 
|---|
| .. | .. | 
|---|
| 1837 | 1928 | int mds = -1;   /* keep track of how far we've gone through i_caps list | 
|---|
| 1838 | 1929 | to avoid an infinite loop on retry */ | 
|---|
| 1839 | 1930 | struct rb_node *p; | 
|---|
| 1840 |  | -	int delayed = 0, sent = 0; | 
|---|
| 1841 |  | -	bool no_delay = flags & CHECK_CAPS_NODELAY; | 
|---|
| 1842 | 1931 | bool queue_invalidate = false; | 
|---|
| 1843 | 1932 | bool tried_invalidate = false; | 
|---|
| 1844 | 1933 |  | 
|---|
| 1845 |  | -	/* if we are unmounting, flush any unused caps immediately. */ | 
|---|
| 1846 |  | -	if (mdsc->stopping) | 
|---|
| 1847 |  | -		no_delay = true; | 
|---|
| 1848 |  | - | 
|---|
| 1849 | 1934 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 1850 |  | - | 
|---|
| 1851 | 1935 | if (ci->i_ceph_flags & CEPH_I_FLUSH) | 
|---|
| 1852 | 1936 | flags |= CHECK_CAPS_FLUSH; | 
|---|
| 1853 |  | - | 
|---|
| 1854 |  | -	if (!(flags & CHECK_CAPS_AUTHONLY) || | 
|---|
| 1855 |  | -	    (ci->i_auth_cap && __ceph_is_single_caps(ci))) | 
|---|
| 1856 |  | -		__cap_delay_cancel(mdsc, ci); | 
|---|
| 1857 | 1937 |  | 
|---|
| 1858 | 1938 | goto retry_locked; | 
|---|
| 1859 | 1939 | retry: | 
|---|
| 1860 | 1940 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 1861 | 1941 | retry_locked: | 
|---|
|  | 1942 | +	/* Caps wanted by virtue of active open files. */ | 
|---|
| 1862 | 1943 | file_wanted = __ceph_caps_file_wanted(ci); | 
|---|
|  | 1944 | + | 
|---|
|  | 1945 | +	/* Caps which have active references against them */ | 
|---|
| 1863 | 1946 | used = __ceph_caps_used(ci); | 
|---|
|  | 1947 | + | 
|---|
|  | 1948 | +	/* | 
|---|
|  | 1949 | +	 * "issued" represents the current caps that the MDS wants us to have. | 
|---|
|  | 1950 | +	 * "implemented" is the set that we have been granted, and includes the | 
|---|
|  | 1951 | +	 * ones that have not yet been returned to the MDS (the "revoking" set, | 
|---|
|  | 1952 | +	 * usually because they have outstanding references). | 
|---|
|  | 1953 | +	 */ | 
|---|
| 1864 | 1954 | issued = __ceph_caps_issued(ci, &implemented); | 
|---|
| 1865 | 1955 | revoking = implemented & ~issued; | 
|---|
| 1866 | 1956 |  | 
|---|
| 1867 | 1957 | want = file_wanted; | 
|---|
|  | 1958 | + | 
|---|
|  | 1959 | +	/* The ones we currently want to retain (may be adjusted below) */ | 
|---|
| 1868 | 1960 | retain = file_wanted | used | CEPH_CAP_PIN; | 
|---|
| 1869 | 1961 | if (!mdsc->stopping && inode->i_nlink > 0) { | 
|---|
| 1870 | 1962 | if (file_wanted) { | 
|---|
| 1871 | 1963 | retain |= CEPH_CAP_ANY;       /* be greedy */ | 
|---|
| 1872 | 1964 | } else if (S_ISDIR(inode->i_mode) && | 
|---|
| 1873 | 1965 | (issued & CEPH_CAP_FILE_SHARED) && | 
|---|
| 1874 |  | -			    __ceph_dir_is_complete(ci)) { | 
|---|
|  | 1966 | +			   __ceph_dir_is_complete(ci)) { | 
|---|
| 1875 | 1967 | /* | 
|---|
| 1876 | 1968 | * If a directory is complete, we want to keep | 
|---|
| 1877 | 1969 | * the exclusive cap. So that MDS does not end up | 
|---|
| 1878 | 1970 | * revoking the shared cap on every create/unlink | 
|---|
| 1879 | 1971 | * operation. | 
|---|
| 1880 | 1972 | */ | 
|---|
| 1881 |  | -			want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; | 
|---|
|  | 1973 | +			if (IS_RDONLY(inode)) { | 
|---|
|  | 1974 | +				want = CEPH_CAP_ANY_SHARED; | 
|---|
|  | 1975 | +			} else { | 
|---|
|  | 1976 | +				want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; | 
|---|
|  | 1977 | +			} | 
|---|
| 1882 | 1978 | retain |= want; | 
|---|
| 1883 | 1979 | } else { | 
|---|
| 1884 | 1980 |  | 
|---|
| .. | .. | 
|---|
| 1894 | 1990 | } | 
|---|
| 1895 | 1991 |  | 
|---|
| 1896 | 1992 | dout("check_caps %p file_want %s used %s dirty %s flushing %s" | 
|---|
| 1897 |  | -	     " issued %s revoking %s retain %s %s%s%s\n", inode, | 
|---|
|  | 1993 | +	     " issued %s revoking %s retain %s %s%s\n", inode, | 
|---|
| 1898 | 1994 | ceph_cap_string(file_wanted), | 
|---|
| 1899 | 1995 | ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), | 
|---|
| 1900 | 1996 | ceph_cap_string(ci->i_flushing_caps), | 
|---|
| 1901 | 1997 | ceph_cap_string(issued), ceph_cap_string(revoking), | 
|---|
| 1902 | 1998 | ceph_cap_string(retain), | 
|---|
| 1903 | 1999 | (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", | 
|---|
| 1904 |  | -	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", | 
|---|
| 1905 | 2000 | (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); | 
|---|
| 1906 | 2001 |  | 
|---|
| 1907 | 2002 | /* | 
|---|
| .. | .. | 
|---|
| 1909 | 2004 | * have cached pages, but don't want them, then try to invalidate. | 
|---|
| 1910 | 2005 | * If we fail, it's because pages are locked.... try again later. | 
|---|
| 1911 | 2006 | */ | 
|---|
| 1912 |  | -	if ((!no_delay || mdsc->stopping) && | 
|---|
| 1913 |  | -	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */ | 
|---|
|  | 2007 | +	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && | 
|---|
|  | 2008 | +	    S_ISREG(inode->i_mode) && | 
|---|
| 1914 | 2009 | !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */ | 
|---|
| 1915 | 2010 | inode->i_data.nrpages &&		/* have cached pages */ | 
|---|
| 1916 | 2011 | (revoking & (CEPH_CAP_FILE_CACHE| | 
|---|
| .. | .. | 
|---|
| 1927 | 2022 | } | 
|---|
| 1928 | 2023 |  | 
|---|
| 1929 | 2024 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 
|---|
|  | 2025 | +		int mflags = 0; | 
|---|
|  | 2026 | +		struct cap_msg_args arg; | 
|---|
|  | 2027 | + | 
|---|
| 1930 | 2028 | cap = rb_entry(p, struct ceph_cap, ci_node); | 
|---|
| 1931 | 2029 |  | 
|---|
| 1932 | 2030 | /* avoid looping forever */ | 
|---|
| .. | .. | 
|---|
| 1936 | 2034 |  | 
|---|
| 1937 | 2035 | /* NOTE: no side-effects allowed, until we take s_mutex */ | 
|---|
| 1938 | 2036 |  | 
|---|
|  | 2037 | +		/* | 
|---|
|  | 2038 | +		 * If we have an auth cap, we don't need to consider any | 
|---|
|  | 2039 | +		 * overlapping caps as used. | 
|---|
|  | 2040 | +		 */ | 
|---|
| 1939 | 2041 | cap_used = used; | 
|---|
| 1940 | 2042 | if (ci->i_auth_cap && cap != ci->i_auth_cap) | 
|---|
| 1941 | 2043 | cap_used &= ~ci->i_auth_cap->issued; | 
|---|
| .. | .. | 
|---|
| 1990 | 2092 | } | 
|---|
| 1991 | 2093 |  | 
|---|
| 1992 | 2094 | /* things we might delay */ | 
|---|
| 1993 |  | -		if ((cap->issued & ~retain) == 0 && | 
|---|
| 1994 |  | -		    cap->mds_wanted == want) | 
|---|
|  | 2095 | +		if ((cap->issued & ~retain) == 0) | 
|---|
| 1995 | 2096 | continue;     /* nope, all good */ | 
|---|
| 1996 | 2097 |  | 
|---|
| 1997 |  | -		if (no_delay) | 
|---|
| 1998 |  | -			goto ack; | 
|---|
| 1999 |  | - | 
|---|
| 2000 |  | -		/* delay? */ | 
|---|
| 2001 |  | -		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && | 
|---|
| 2002 |  | -		    time_before(jiffies, ci->i_hold_caps_max)) { | 
|---|
| 2003 |  | -			dout(" delaying issued %s -> %s, wanted %s -> %s\n", | 
|---|
| 2004 |  | -			     ceph_cap_string(cap->issued), | 
|---|
| 2005 |  | -			     ceph_cap_string(cap->issued & retain), | 
|---|
| 2006 |  | -			     ceph_cap_string(cap->mds_wanted), | 
|---|
| 2007 |  | -			     ceph_cap_string(want)); | 
|---|
| 2008 |  | -			delayed++; | 
|---|
| 2009 |  | -			continue; | 
|---|
| 2010 |  | -		} | 
|---|
| 2011 |  | - | 
|---|
| 2012 | 2098 | ack: | 
|---|
| 2013 |  | -		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 
|---|
| 2014 |  | -			dout(" skipping %p I_NOFLUSH set\n", inode); | 
|---|
| 2015 |  | -			continue; | 
|---|
| 2016 |  | -		} | 
|---|
| 2017 |  | - | 
|---|
| 2018 | 2099 | if (session && session != cap->session) { | 
|---|
| 2019 | 2100 | dout("oops, wrong session %p mutex\n", session); | 
|---|
| 2020 | 2101 | mutex_unlock(&session->s_mutex); | 
|---|
| .. | .. | 
|---|
| 2052 | 2133 | if (cap == ci->i_auth_cap && | 
|---|
| 2053 | 2134 | (ci->i_ceph_flags & | 
|---|
| 2054 | 2135 | (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { | 
|---|
| 2055 |  | -			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { | 
|---|
|  | 2136 | +			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) | 
|---|
| 2056 | 2137 | __kick_flushing_caps(mdsc, session, ci, 0); | 
|---|
| 2057 |  | -				ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
| 2058 |  | -			} | 
|---|
| 2059 | 2138 | if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) | 
|---|
| 2060 | 2139 | __ceph_flush_snaps(ci, session); | 
|---|
| 2061 | 2140 |  | 
|---|
| .. | .. | 
|---|
| 2076 | 2155 | } | 
|---|
| 2077 | 2156 |  | 
|---|
| 2078 | 2157 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { | 
|---|
| 2079 |  | -			flushing = __mark_caps_flushing(inode, session, false, | 
|---|
| 2080 |  | -							&flush_tid, | 
|---|
| 2081 |  | -							&oldest_flush_tid); | 
|---|
|  | 2158 | +			flushing = ci->i_dirty_caps; | 
|---|
|  | 2159 | +			flush_tid = __mark_caps_flushing(inode, session, false, | 
|---|
|  | 2160 | +							 &oldest_flush_tid); | 
|---|
|  | 2161 | +			if (flags & CHECK_CAPS_FLUSH && | 
|---|
|  | 2162 | +			    list_empty(&session->s_cap_dirty)) | 
|---|
|  | 2163 | +				mflags |= CEPH_CLIENT_CAPS_SYNC; | 
|---|
| 2082 | 2164 | } else { | 
|---|
| 2083 | 2165 | flushing = 0; | 
|---|
| 2084 | 2166 | flush_tid = 0; | 
|---|
| .. | .. | 
|---|
| 2088 | 2170 | } | 
|---|
| 2089 | 2171 |  | 
|---|
| 2090 | 2172 | mds = cap->mds;  /* remember mds, so we don't repeat */ | 
|---|
| 2091 |  | -		sent++; | 
|---|
| 2092 | 2173 |  | 
|---|
| 2093 |  | -		/* __send_cap drops i_ceph_lock */ | 
|---|
| 2094 |  | -		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, | 
|---|
| 2095 |  | -				cap_used, want, retain, flushing, | 
|---|
| 2096 |  | -				flush_tid, oldest_flush_tid); | 
|---|
|  | 2174 | +		__prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, | 
|---|
|  | 2175 | +			   want, retain, flushing, flush_tid, oldest_flush_tid); | 
|---|
|  | 2176 | +		spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 2177 | + | 
|---|
|  | 2178 | +		__send_cap(&arg, ci); | 
|---|
|  | 2179 | + | 
|---|
| 2097 | 2180 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ | 
|---|
| 2098 | 2181 | } | 
|---|
| 2099 | 2182 |  | 
|---|
| 2100 |  | -	/* Reschedule delayed caps release if we delayed anything */ | 
|---|
| 2101 |  | -	if (delayed) | 
|---|
|  | 2183 | +	/* periodically re-calculate caps wanted by open files */ | 
|---|
|  | 2184 | +	if (__ceph_is_any_real_caps(ci) && | 
|---|
|  | 2185 | +	    list_empty(&ci->i_cap_delay_list) && | 
|---|
|  | 2186 | +	    (file_wanted & ~CEPH_CAP_PIN) && | 
|---|
|  | 2187 | +	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { | 
|---|
| 2102 | 2188 | __cap_delay_requeue(mdsc, ci); | 
|---|
|  | 2189 | +	} | 
|---|
| 2103 | 2190 |  | 
|---|
| 2104 | 2191 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2105 | 2192 |  | 
|---|
| .. | .. | 
|---|
| 2125 | 2212 |  | 
|---|
| 2126 | 2213 | retry: | 
|---|
| 2127 | 2214 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 2128 |  | -	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 
|---|
| 2129 |  | -		spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2130 |  | -		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); | 
|---|
| 2131 |  | -		goto out; | 
|---|
| 2132 |  | -	} | 
|---|
|  | 2215 | +retry_locked: | 
|---|
| 2133 | 2216 | if (ci->i_dirty_caps && ci->i_auth_cap) { | 
|---|
| 2134 | 2217 | struct ceph_cap *cap = ci->i_auth_cap; | 
|---|
| 2135 |  | -		int used = __ceph_caps_used(ci); | 
|---|
| 2136 |  | -		int want = __ceph_caps_wanted(ci); | 
|---|
| 2137 |  | -		int delayed; | 
|---|
|  | 2218 | +		struct cap_msg_args arg; | 
|---|
| 2138 | 2219 |  | 
|---|
| 2139 |  | -		if (!session || session != cap->session) { | 
|---|
|  | 2220 | +		if (session != cap->session) { | 
|---|
| 2140 | 2221 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2141 | 2222 | if (session) | 
|---|
| 2142 | 2223 | mutex_unlock(&session->s_mutex); | 
|---|
| .. | .. | 
|---|
| 2149 | 2230 | goto out; | 
|---|
| 2150 | 2231 | } | 
|---|
| 2151 | 2232 |  | 
|---|
| 2152 |  | -		flushing = __mark_caps_flushing(inode, session, true, | 
|---|
| 2153 |  | -						&flush_tid, &oldest_flush_tid); | 
|---|
| 2154 |  | - | 
|---|
| 2155 |  | -		/* __send_cap drops i_ceph_lock */ | 
|---|
| 2156 |  | -		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, | 
|---|
| 2157 |  | -				used, want, (cap->issued | cap->implemented), | 
|---|
| 2158 |  | -				flushing, flush_tid, oldest_flush_tid); | 
|---|
| 2159 |  | - | 
|---|
| 2160 |  | -		if (delayed) { | 
|---|
| 2161 |  | -			spin_lock(&ci->i_ceph_lock); | 
|---|
| 2162 |  | -			__cap_delay_requeue(mdsc, ci); | 
|---|
| 2163 |  | -			spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 2233 | +		if (ci->i_ceph_flags & | 
|---|
|  | 2234 | +		    (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { | 
|---|
|  | 2235 | +			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) | 
|---|
|  | 2236 | +				__kick_flushing_caps(mdsc, session, ci, 0); | 
|---|
|  | 2237 | +			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) | 
|---|
|  | 2238 | +				__ceph_flush_snaps(ci, session); | 
|---|
|  | 2239 | +			goto retry_locked; | 
|---|
| 2164 | 2240 | } | 
|---|
|  | 2241 | + | 
|---|
|  | 2242 | +		flushing = ci->i_dirty_caps; | 
|---|
|  | 2243 | +		flush_tid = __mark_caps_flushing(inode, session, true, | 
|---|
|  | 2244 | +						 &oldest_flush_tid); | 
|---|
|  | 2245 | + | 
|---|
|  | 2246 | +		__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, | 
|---|
|  | 2247 | +			   __ceph_caps_used(ci), __ceph_caps_wanted(ci), | 
|---|
|  | 2248 | +			   (cap->issued | cap->implemented), | 
|---|
|  | 2249 | +			   flushing, flush_tid, oldest_flush_tid); | 
|---|
|  | 2250 | +		spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 2251 | + | 
|---|
|  | 2252 | +		__send_cap(&arg, ci); | 
|---|
| 2165 | 2253 | } else { | 
|---|
| 2166 | 2254 | if (!list_empty(&ci->i_cap_flush_list)) { | 
|---|
| 2167 | 2255 | struct ceph_cap_flush *cf = | 
|---|
| .. | .. | 
|---|
| 2206 | 2294 | */ | 
|---|
| 2207 | 2295 | static int unsafe_request_wait(struct inode *inode) | 
|---|
| 2208 | 2296 | { | 
|---|
|  | 2297 | +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 
|---|
| 2209 | 2298 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 2210 | 2299 | struct ceph_mds_request *req1 = NULL, *req2 = NULL; | 
|---|
| 2211 | 2300 | int ret, err = 0; | 
|---|
| .. | .. | 
|---|
| 2225 | 2314 | } | 
|---|
| 2226 | 2315 | spin_unlock(&ci->i_unsafe_lock); | 
|---|
| 2227 | 2316 |  | 
|---|
|  | 2317 | +	/* | 
|---|
|  | 2318 | +	 * Manually trigger a flush of the journal logs in all the relevant | 
|---|
|  | 2319 | +	 * MDSes; otherwise, in the worst case, we must wait up to 5 seconds | 
|---|
|  | 2320 | +	 * for the MDSes to flush their journal logs periodically. | 
|---|
|  | 2321 | +	 */ | 
|---|
|  | 2322 | +	if (req1 || req2) { | 
|---|
|  | 2323 | +		struct ceph_mds_request *req; | 
|---|
|  | 2324 | +		struct ceph_mds_session **sessions; | 
|---|
|  | 2325 | +		struct ceph_mds_session *s; | 
|---|
|  | 2326 | +		unsigned int max_sessions; | 
|---|
|  | 2327 | +		int i; | 
|---|
|  | 2328 | + | 
|---|
|  | 2329 | +		mutex_lock(&mdsc->mutex); | 
|---|
|  | 2330 | +		max_sessions = mdsc->max_sessions; | 
|---|
|  | 2331 | + | 
|---|
|  | 2332 | +		sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); | 
|---|
|  | 2333 | +		if (!sessions) { | 
|---|
|  | 2334 | +			mutex_unlock(&mdsc->mutex); | 
|---|
|  | 2335 | +			err = -ENOMEM; | 
|---|
|  | 2336 | +			goto out; | 
|---|
|  | 2337 | +		} | 
|---|
|  | 2338 | + | 
|---|
|  | 2339 | +		spin_lock(&ci->i_unsafe_lock); | 
|---|
|  | 2340 | +		if (req1) { | 
|---|
|  | 2341 | +			list_for_each_entry(req, &ci->i_unsafe_dirops, | 
|---|
|  | 2342 | +					    r_unsafe_dir_item) { | 
|---|
|  | 2343 | +				s = req->r_session; | 
|---|
|  | 2344 | +				if (!s) | 
|---|
|  | 2345 | +					continue; | 
|---|
|  | 2346 | +				if (!sessions[s->s_mds]) { | 
|---|
|  | 2347 | +					s = ceph_get_mds_session(s); | 
|---|
|  | 2348 | +					sessions[s->s_mds] = s; | 
|---|
|  | 2349 | +				} | 
|---|
|  | 2350 | +			} | 
|---|
|  | 2351 | +		} | 
|---|
|  | 2352 | +		if (req2) { | 
|---|
|  | 2353 | +			list_for_each_entry(req, &ci->i_unsafe_iops, | 
|---|
|  | 2354 | +					    r_unsafe_target_item) { | 
|---|
|  | 2355 | +				s = req->r_session; | 
|---|
|  | 2356 | +				if (!s) | 
|---|
|  | 2357 | +					continue; | 
|---|
|  | 2358 | +				if (!sessions[s->s_mds]) { | 
|---|
|  | 2359 | +					s = ceph_get_mds_session(s); | 
|---|
|  | 2360 | +					sessions[s->s_mds] = s; | 
|---|
|  | 2361 | +				} | 
|---|
|  | 2362 | +			} | 
|---|
|  | 2363 | +		} | 
|---|
|  | 2364 | +		spin_unlock(&ci->i_unsafe_lock); | 
|---|
|  | 2365 | + | 
|---|
|  | 2366 | +		/* the auth MDS */ | 
|---|
|  | 2367 | +		spin_lock(&ci->i_ceph_lock); | 
|---|
|  | 2368 | +		if (ci->i_auth_cap) { | 
|---|
|  | 2369 | +			s = ci->i_auth_cap->session; | 
|---|
|  | 2370 | +			if (!sessions[s->s_mds]) | 
|---|
|  | 2371 | +				sessions[s->s_mds] = ceph_get_mds_session(s); | 
|---|
|  | 2372 | +		} | 
|---|
|  | 2373 | +		spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 2374 | +		mutex_unlock(&mdsc->mutex); | 
|---|
|  | 2375 | + | 
|---|
|  | 2376 | +		/* send flush mdlog request to MDSes */ | 
|---|
|  | 2377 | +		for (i = 0; i < max_sessions; i++) { | 
|---|
|  | 2378 | +			s = sessions[i]; | 
|---|
|  | 2379 | +			if (s) { | 
|---|
|  | 2380 | +				send_flush_mdlog(s); | 
|---|
|  | 2381 | +				ceph_put_mds_session(s); | 
|---|
|  | 2382 | +			} | 
|---|
|  | 2383 | +		} | 
|---|
|  | 2384 | +		kfree(sessions); | 
|---|
|  | 2385 | +	} | 
|---|
|  | 2386 | + | 
|---|
| 2228 | 2387 | dout("unsafe_request_wait %p wait on tid %llu %llu\n", | 
|---|
| 2229 | 2388 | inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); | 
|---|
| 2230 | 2389 | if (req1) { | 
|---|
| .. | .. | 
|---|
| 2232 | 2391 | ceph_timeout_jiffies(req1->r_timeout)); | 
|---|
| 2233 | 2392 | if (ret) | 
|---|
| 2234 | 2393 | err = -EIO; | 
|---|
| 2235 |  | -		ceph_mdsc_put_request(req1); | 
|---|
| 2236 | 2394 | } | 
|---|
| 2237 | 2395 | if (req2) { | 
|---|
| 2238 | 2396 | ret = !wait_for_completion_timeout(&req2->r_safe_completion, | 
|---|
| 2239 | 2397 | ceph_timeout_jiffies(req2->r_timeout)); | 
|---|
| 2240 | 2398 | if (ret) | 
|---|
| 2241 | 2399 | err = -EIO; | 
|---|
| 2242 |  | -		ceph_mdsc_put_request(req2); | 
|---|
| 2243 | 2400 | } | 
|---|
|  | 2401 | + | 
|---|
|  | 2402 | +out: | 
|---|
|  | 2403 | +	if (req1) | 
|---|
|  | 2404 | +		ceph_mdsc_put_request(req1); | 
|---|
|  | 2405 | +	if (req2) | 
|---|
|  | 2406 | +		ceph_mdsc_put_request(req2); | 
|---|
| 2244 | 2407 | return err; | 
|---|
| 2245 | 2408 | } | 
|---|
| 2246 | 2409 |  | 
|---|
| .. | .. | 
|---|
| 2249 | 2412 | struct inode *inode = file->f_mapping->host; | 
|---|
| 2250 | 2413 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 2251 | 2414 | u64 flush_tid; | 
|---|
| 2252 |  | -	int ret; | 
|---|
|  | 2415 | +	int ret, err; | 
|---|
| 2253 | 2416 | int dirty; | 
|---|
| 2254 | 2417 |  | 
|---|
| 2255 | 2418 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); | 
|---|
| 2256 | 2419 |  | 
|---|
| 2257 | 2420 | ret = file_write_and_wait_range(file, start, end); | 
|---|
| 2258 |  | -	if (ret < 0) | 
|---|
| 2259 |  | -		goto out; | 
|---|
| 2260 |  | - | 
|---|
| 2261 | 2421 | if (datasync) | 
|---|
| 2262 | 2422 | goto out; | 
|---|
| 2263 | 2423 |  | 
|---|
| 2264 |  | -	inode_lock(inode); | 
|---|
|  | 2424 | +	ret = ceph_wait_on_async_create(inode); | 
|---|
|  | 2425 | +	if (ret) | 
|---|
|  | 2426 | +		goto out; | 
|---|
| 2265 | 2427 |  | 
|---|
| 2266 | 2428 | dirty = try_flush_caps(inode, &flush_tid); | 
|---|
| 2267 | 2429 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); | 
|---|
| 2268 | 2430 |  | 
|---|
| 2269 |  | -	ret = unsafe_request_wait(inode); | 
|---|
|  | 2431 | +	err = unsafe_request_wait(inode); | 
|---|
| 2270 | 2432 |  | 
|---|
| 2271 | 2433 | /* | 
|---|
| 2272 | 2434 | * only wait on non-file metadata writeback (the mds | 
|---|
| 2273 | 2435 | * can recover size and mtime, so we don't need to | 
|---|
| 2274 | 2436 | * wait for that) | 
|---|
| 2275 | 2437 | */ | 
|---|
| 2276 |  | -	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { | 
|---|
| 2277 |  | -		ret = wait_event_interruptible(ci->i_cap_wq, | 
|---|
|  | 2438 | +	if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { | 
|---|
|  | 2439 | +		err = wait_event_interruptible(ci->i_cap_wq, | 
|---|
| 2278 | 2440 | caps_are_flushed(inode, flush_tid)); | 
|---|
| 2279 | 2441 | } | 
|---|
| 2280 |  | -	inode_unlock(inode); | 
|---|
|  | 2442 | + | 
|---|
|  | 2443 | +	if (err < 0) | 
|---|
|  | 2444 | +		ret = err; | 
|---|
|  | 2445 | + | 
|---|
|  | 2446 | +	err = file_check_and_advance_wb_err(file); | 
|---|
|  | 2447 | +	if (err < 0) | 
|---|
|  | 2448 | +		ret = err; | 
|---|
| 2281 | 2449 | out: | 
|---|
| 2282 | 2450 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); | 
|---|
| 2283 | 2451 | return ret; | 
|---|
| .. | .. | 
|---|
| 2327 | 2495 | struct ceph_cap_flush *cf; | 
|---|
| 2328 | 2496 | int ret; | 
|---|
| 2329 | 2497 | u64 first_tid = 0; | 
|---|
|  | 2498 | +	u64 last_snap_flush = 0; | 
|---|
|  | 2499 | + | 
|---|
|  | 2500 | +	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
|  | 2501 | + | 
|---|
|  | 2502 | +	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { | 
|---|
|  | 2503 | +		if (cf->is_capsnap) { | 
|---|
|  | 2504 | +			last_snap_flush = cf->tid; | 
|---|
|  | 2505 | +			break; | 
|---|
|  | 2506 | +		} | 
|---|
|  | 2507 | +	} | 
|---|
| 2330 | 2508 |  | 
|---|
| 2331 | 2509 | list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { | 
|---|
| 2332 | 2510 | if (cf->tid < first_tid) | 
|---|
| .. | .. | 
|---|
| 2341 | 2519 |  | 
|---|
| 2342 | 2520 | first_tid = cf->tid + 1; | 
|---|
| 2343 | 2521 |  | 
|---|
| 2344 |  | -		if (cf->caps) { | 
|---|
|  | 2522 | +		if (!cf->is_capsnap) { | 
|---|
|  | 2523 | +			struct cap_msg_args arg; | 
|---|
|  | 2524 | + | 
|---|
| 2345 | 2525 | dout("kick_flushing_caps %p cap %p tid %llu %s\n", | 
|---|
| 2346 | 2526 | inode, cap, cf->tid, ceph_cap_string(cf->caps)); | 
|---|
| 2347 |  | -			ci->i_ceph_flags |= CEPH_I_NODELAY; | 
|---|
| 2348 |  | -			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, | 
|---|
| 2349 |  | -					  false, __ceph_caps_used(ci), | 
|---|
|  | 2527 | +			__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, | 
|---|
|  | 2528 | +					 (cf->tid < last_snap_flush ? | 
|---|
|  | 2529 | +					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), | 
|---|
|  | 2530 | +					  __ceph_caps_used(ci), | 
|---|
| 2350 | 2531 | __ceph_caps_wanted(ci), | 
|---|
| 2351 |  | -					  cap->issued | cap->implemented, | 
|---|
|  | 2532 | +					  (cap->issued | cap->implemented), | 
|---|
| 2352 | 2533 | cf->caps, cf->tid, oldest_flush_tid); | 
|---|
| 2353 |  | -			if (ret) { | 
|---|
| 2354 |  | -				pr_err("kick_flushing_caps: error sending " | 
|---|
| 2355 |  | -					"cap flush, ino (%llx.%llx) " | 
|---|
| 2356 |  | -					"tid %llu flushing %s\n", | 
|---|
| 2357 |  | -					ceph_vinop(inode), cf->tid, | 
|---|
| 2358 |  | -					ceph_cap_string(cf->caps)); | 
|---|
| 2359 |  | -			} | 
|---|
|  | 2534 | +			spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 2535 | +			__send_cap(&arg, ci); | 
|---|
| 2360 | 2536 | } else { | 
|---|
| 2361 | 2537 | struct ceph_cap_snap *capsnap = | 
|---|
| 2362 | 2538 | container_of(cf, struct ceph_cap_snap, | 
|---|
| .. | .. | 
|---|
| 2417 | 2593 | */ | 
|---|
| 2418 | 2594 | if ((cap->issued & ci->i_flushing_caps) != | 
|---|
| 2419 | 2595 | ci->i_flushing_caps) { | 
|---|
| 2420 |  | -			ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
|  | 2596 | +			/* encode_caps_cb() will also reset these sequence | 
|---|
|  | 2597 | +			 * numbers. Make sure the sequence numbers in the cap flush | 
|---|
|  | 2598 | +			 * message match those in the later reconnect message */ | 
|---|
|  | 2599 | +			cap->seq = 0; | 
|---|
|  | 2600 | +			cap->issue_seq = 0; | 
|---|
|  | 2601 | +			cap->mseq = 0; | 
|---|
| 2421 | 2602 | __kick_flushing_caps(mdsc, session, ci, | 
|---|
| 2422 | 2603 | oldest_flush_tid); | 
|---|
| 2423 | 2604 | } else { | 
|---|
| .. | .. | 
|---|
| 2435 | 2616 | struct ceph_cap *cap; | 
|---|
| 2436 | 2617 | u64 oldest_flush_tid; | 
|---|
| 2437 | 2618 |  | 
|---|
|  | 2619 | +	lockdep_assert_held(&session->s_mutex); | 
|---|
|  | 2620 | + | 
|---|
| 2438 | 2621 | dout("kick_flushing_caps mds%d\n", session->s_mds); | 
|---|
| 2439 | 2622 |  | 
|---|
| 2440 | 2623 | spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| .. | .. | 
|---|
| 2451 | 2634 | continue; | 
|---|
| 2452 | 2635 | } | 
|---|
| 2453 | 2636 | if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { | 
|---|
| 2454 |  | -			ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
| 2455 | 2637 | __kick_flushing_caps(mdsc, session, ci, | 
|---|
| 2456 | 2638 | oldest_flush_tid); | 
|---|
| 2457 | 2639 | } | 
|---|
| .. | .. | 
|---|
| 2459 | 2641 | } | 
|---|
| 2460 | 2642 | } | 
|---|
| 2461 | 2643 |  | 
|---|
| 2462 |  | -static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | 
|---|
| 2463 |  | -				     struct ceph_mds_session *session, | 
|---|
| 2464 |  | -				     struct inode *inode) | 
|---|
| 2465 |  | -	__releases(ci->i_ceph_lock) | 
|---|
|  | 2644 | +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, | 
|---|
|  | 2645 | +				   struct ceph_inode_info *ci) | 
|---|
| 2466 | 2646 | { | 
|---|
| 2467 |  | -	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 2468 |  | -	struct ceph_cap *cap; | 
|---|
|  | 2647 | +	struct ceph_mds_client *mdsc = session->s_mdsc; | 
|---|
|  | 2648 | +	struct ceph_cap *cap = ci->i_auth_cap; | 
|---|
| 2469 | 2649 |  | 
|---|
| 2470 |  | -	cap = ci->i_auth_cap; | 
|---|
| 2471 |  | -	dout("kick_flushing_inode_caps %p flushing %s\n", inode, | 
|---|
|  | 2650 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 2651 | + | 
|---|
|  | 2652 | +	dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, | 
|---|
| 2472 | 2653 | ceph_cap_string(ci->i_flushing_caps)); | 
|---|
| 2473 | 2654 |  | 
|---|
| 2474 | 2655 | if (!list_empty(&ci->i_cap_flush_list)) { | 
|---|
| .. | .. | 
|---|
| 2479 | 2660 | oldest_flush_tid = __get_oldest_flush_tid(mdsc); | 
|---|
| 2480 | 2661 | spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
| 2481 | 2662 |  | 
|---|
| 2482 |  | -		ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; | 
|---|
| 2483 | 2663 | __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); | 
|---|
| 2484 |  | -		spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2485 |  | -	} else { | 
|---|
| 2486 |  | -		spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2487 | 2664 | } | 
|---|
| 2488 | 2665 | } | 
|---|
| 2489 | 2666 |  | 
|---|
| .. | .. | 
|---|
| 2491 | 2668 | /* | 
|---|
| 2492 | 2669 | * Take references to capabilities we hold, so that we don't release | 
|---|
| 2493 | 2670 | * them to the MDS prematurely. | 
|---|
| 2494 |  | - * | 
|---|
| 2495 |  | - * Protected by i_ceph_lock. | 
|---|
| 2496 | 2671 | */ | 
|---|
| 2497 |  | -static void __take_cap_refs(struct ceph_inode_info *ci, int got, | 
|---|
|  | 2672 | +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, | 
|---|
| 2498 | 2673 | bool snap_rwsem_locked) | 
|---|
| 2499 | 2674 | { | 
|---|
|  | 2675 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 2676 | + | 
|---|
| 2500 | 2677 | if (got & CEPH_CAP_PIN) | 
|---|
| 2501 | 2678 | ci->i_pin_ref++; | 
|---|
| 2502 | 2679 | if (got & CEPH_CAP_FILE_RD) | 
|---|
| 2503 | 2680 | ci->i_rd_ref++; | 
|---|
| 2504 | 2681 | if (got & CEPH_CAP_FILE_CACHE) | 
|---|
| 2505 | 2682 | ci->i_rdcache_ref++; | 
|---|
|  | 2683 | +	if (got & CEPH_CAP_FILE_EXCL) | 
|---|
|  | 2684 | +		ci->i_fx_ref++; | 
|---|
| 2506 | 2685 | if (got & CEPH_CAP_FILE_WR) { | 
|---|
| 2507 | 2686 | if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { | 
|---|
| 2508 | 2687 | BUG_ON(!snap_rwsem_locked); | 
|---|
| .. | .. | 
|---|
| 2515 | 2694 | if (ci->i_wb_ref == 0) | 
|---|
| 2516 | 2695 | ihold(&ci->vfs_inode); | 
|---|
| 2517 | 2696 | ci->i_wb_ref++; | 
|---|
| 2518 |  | -		dout("__take_cap_refs %p wb %d -> %d (?)\n", | 
|---|
|  | 2697 | +		dout("%s %p wb %d -> %d (?)\n", __func__, | 
|---|
| 2519 | 2698 | &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); | 
|---|
| 2520 | 2699 | } | 
|---|
| 2521 | 2700 | } | 
|---|
| .. | .. | 
|---|
| 2526 | 2705 | * to (when applicable), and check against max_size here as well. | 
|---|
| 2527 | 2706 | * Note that caller is responsible for ensuring max_size increases are | 
|---|
| 2528 | 2707 | * requested from the MDS. | 
|---|
|  | 2708 | + * | 
|---|
|  | 2709 | + * Returns 0 if the caps could not be acquired (yet), 1 on success, | 
|---|
|  | 2710 | + * or a negative error code. There are 3 special error codes: | 
|---|
|  | 2711 | + *  -EAGAIN: need to sleep but non-blocking is specified | 
|---|
|  | 2712 | + *  -EFBIG:  ask caller to call check_max_size() and try again. | 
|---|
|  | 2713 | + *  -ESTALE: ask caller to call ceph_renew_caps() and try again. | 
|---|
| 2529 | 2714 | */ | 
|---|
| 2530 |  | -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | 
|---|
| 2531 |  | -			    loff_t endoff, bool nonblock, int *got, int *err) | 
|---|
|  | 2715 | +enum { | 
|---|
|  | 2716 | +	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ | 
|---|
|  | 2717 | +	NON_BLOCKING	= (1 << 8), | 
|---|
|  | 2718 | +	CHECK_FILELOCK	= (1 << 9), | 
|---|
|  | 2719 | +}; | 
|---|
|  | 2720 | + | 
|---|
|  | 2721 | +static int try_get_cap_refs(struct inode *inode, int need, int want, | 
|---|
|  | 2722 | +			    loff_t endoff, int flags, int *got) | 
|---|
| 2532 | 2723 | { | 
|---|
| 2533 |  | -	struct inode *inode = &ci->vfs_inode; | 
|---|
|  | 2724 | +	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 2534 | 2725 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 
|---|
| 2535 | 2726 | int ret = 0; | 
|---|
| 2536 | 2727 | int have, implemented; | 
|---|
| 2537 |  | -	int file_wanted; | 
|---|
| 2538 | 2728 | bool snap_rwsem_locked = false; | 
|---|
| 2539 | 2729 |  | 
|---|
| 2540 | 2730 | dout("get_cap_refs %p need %s want %s\n", inode, | 
|---|
| .. | .. | 
|---|
| 2543 | 2733 | again: | 
|---|
| 2544 | 2734 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 2545 | 2735 |  | 
|---|
| 2546 |  | -	/* make sure file is actually open */ | 
|---|
| 2547 |  | -	file_wanted = __ceph_caps_file_wanted(ci); | 
|---|
| 2548 |  | -	if ((file_wanted & need) != need) { | 
|---|
| 2549 |  | -		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", | 
|---|
| 2550 |  | -		     ceph_cap_string(need), ceph_cap_string(file_wanted)); | 
|---|
| 2551 |  | -		*err = -EBADF; | 
|---|
| 2552 |  | -		ret = 1; | 
|---|
|  | 2736 | +	if ((flags & CHECK_FILELOCK) && | 
|---|
|  | 2737 | +	    (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { | 
|---|
|  | 2738 | +		dout("try_get_cap_refs %p error filelock\n", inode); | 
|---|
|  | 2739 | +		ret = -EIO; | 
|---|
| 2553 | 2740 | goto out_unlock; | 
|---|
| 2554 | 2741 | } | 
|---|
| 2555 | 2742 |  | 
|---|
| .. | .. | 
|---|
| 2570 | 2757 | if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { | 
|---|
| 2571 | 2758 | dout("get_cap_refs %p endoff %llu > maxsize %llu\n", | 
|---|
| 2572 | 2759 | inode, endoff, ci->i_max_size); | 
|---|
| 2573 |  | -			if (endoff > ci->i_requested_max_size) { | 
|---|
| 2574 |  | -				*err = -EAGAIN; | 
|---|
| 2575 |  | -				ret = 1; | 
|---|
| 2576 |  | -			} | 
|---|
|  | 2760 | +			if (endoff > ci->i_requested_max_size) | 
|---|
|  | 2761 | +				ret = ci->i_auth_cap ? -EFBIG : -ESTALE; | 
|---|
| 2577 | 2762 | goto out_unlock; | 
|---|
| 2578 | 2763 | } | 
|---|
| 2579 | 2764 | /* | 
|---|
| .. | .. | 
|---|
| 2607 | 2792 | * we can not call down_read() when | 
|---|
| 2608 | 2793 | * task isn't in TASK_RUNNING state | 
|---|
| 2609 | 2794 | */ | 
|---|
| 2610 |  | -					if (nonblock) { | 
|---|
| 2611 |  | -						*err = -EAGAIN; | 
|---|
| 2612 |  | -						ret = 1; | 
|---|
|  | 2795 | +					if (flags & NON_BLOCKING) { | 
|---|
|  | 2796 | +						ret = -EAGAIN; | 
|---|
| 2613 | 2797 | goto out_unlock; | 
|---|
| 2614 | 2798 | } | 
|---|
| 2615 | 2799 |  | 
|---|
| .. | .. | 
|---|
| 2620 | 2804 | } | 
|---|
| 2621 | 2805 | snap_rwsem_locked = true; | 
|---|
| 2622 | 2806 | } | 
|---|
| 2623 |  | -			*got = need | (have & want); | 
|---|
| 2624 |  | -			if ((need & CEPH_CAP_FILE_RD) && | 
|---|
|  | 2807 | +			if ((have & want) == want) | 
|---|
|  | 2808 | +				*got = need | want; | 
|---|
|  | 2809 | +			else | 
|---|
|  | 2810 | +				*got = need; | 
|---|
|  | 2811 | +			if (S_ISREG(inode->i_mode) && | 
|---|
|  | 2812 | +			    (need & CEPH_CAP_FILE_RD) && | 
|---|
| 2625 | 2813 | !(*got & CEPH_CAP_FILE_CACHE)) | 
|---|
| 2626 | 2814 | ceph_disable_fscache_readpage(ci); | 
|---|
| 2627 |  | -			__take_cap_refs(ci, *got, true); | 
|---|
|  | 2815 | +			ceph_take_cap_refs(ci, *got, true); | 
|---|
| 2628 | 2816 | ret = 1; | 
|---|
| 2629 | 2817 | } | 
|---|
| 2630 | 2818 | } else { | 
|---|
| 2631 | 2819 | int session_readonly = false; | 
|---|
| 2632 |  | -		if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { | 
|---|
|  | 2820 | +		int mds_wanted; | 
|---|
|  | 2821 | +		if (ci->i_auth_cap && | 
|---|
|  | 2822 | +		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { | 
|---|
| 2633 | 2823 | struct ceph_mds_session *s = ci->i_auth_cap->session; | 
|---|
| 2634 | 2824 | spin_lock(&s->s_cap_lock); | 
|---|
| 2635 | 2825 | session_readonly = s->s_readonly; | 
|---|
| 2636 | 2826 | spin_unlock(&s->s_cap_lock); | 
|---|
| 2637 | 2827 | } | 
|---|
| 2638 | 2828 | if (session_readonly) { | 
|---|
| 2639 |  | -			dout("get_cap_refs %p needed %s but mds%d readonly\n", | 
|---|
|  | 2829 | +			dout("get_cap_refs %p need %s but mds%d readonly\n", | 
|---|
| 2640 | 2830 | inode, ceph_cap_string(need), ci->i_auth_cap->mds); | 
|---|
| 2641 |  | -			*err = -EROFS; | 
|---|
| 2642 |  | -			ret = 1; | 
|---|
|  | 2831 | +			ret = -EROFS; | 
|---|
| 2643 | 2832 | goto out_unlock; | 
|---|
| 2644 | 2833 | } | 
|---|
| 2645 | 2834 |  | 
|---|
| 2646 |  | -		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { | 
|---|
| 2647 |  | -			int mds_wanted; | 
|---|
| 2648 |  | -			if (READ_ONCE(mdsc->fsc->mount_state) == | 
|---|
| 2649 |  | -			    CEPH_MOUNT_SHUTDOWN) { | 
|---|
| 2650 |  | -				dout("get_cap_refs %p forced umount\n", inode); | 
|---|
| 2651 |  | -				*err = -EIO; | 
|---|
| 2652 |  | -				ret = 1; | 
|---|
| 2653 |  | -				goto out_unlock; | 
|---|
| 2654 |  | -			} | 
|---|
| 2655 |  | -			mds_wanted = __ceph_caps_mds_wanted(ci, false); | 
|---|
| 2656 |  | -			if (need & ~(mds_wanted & need)) { | 
|---|
| 2657 |  | -				dout("get_cap_refs %p caps were dropped" | 
|---|
| 2658 |  | -				     " (session killed?)\n", inode); | 
|---|
| 2659 |  | -				*err = -ESTALE; | 
|---|
| 2660 |  | -				ret = 1; | 
|---|
| 2661 |  | -				goto out_unlock; | 
|---|
| 2662 |  | -			} | 
|---|
| 2663 |  | -			if (!(file_wanted & ~mds_wanted)) | 
|---|
| 2664 |  | -				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; | 
|---|
|  | 2835 | +		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 
|---|
|  | 2836 | +			dout("get_cap_refs %p forced umount\n", inode); | 
|---|
|  | 2837 | +			ret = -EIO; | 
|---|
|  | 2838 | +			goto out_unlock; | 
|---|
|  | 2839 | +		} | 
|---|
|  | 2840 | +		mds_wanted = __ceph_caps_mds_wanted(ci, false); | 
|---|
|  | 2841 | +		if (need & ~mds_wanted) { | 
|---|
|  | 2842 | +			dout("get_cap_refs %p need %s > mds_wanted %s\n", | 
|---|
|  | 2843 | +			     inode, ceph_cap_string(need), | 
|---|
|  | 2844 | +			     ceph_cap_string(mds_wanted)); | 
|---|
|  | 2845 | +			ret = -ESTALE; | 
|---|
|  | 2846 | +			goto out_unlock; | 
|---|
| 2665 | 2847 | } | 
|---|
| 2666 | 2848 |  | 
|---|
| 2667 |  | -		dout("get_cap_refs %p have %s needed %s\n", inode, | 
|---|
|  | 2849 | +		dout("get_cap_refs %p have %s need %s\n", inode, | 
|---|
| 2668 | 2850 | ceph_cap_string(have), ceph_cap_string(need)); | 
|---|
| 2669 | 2851 | } | 
|---|
| 2670 | 2852 | out_unlock: | 
|---|
|  | 2853 | + | 
|---|
|  | 2854 | +	__ceph_touch_fmode(ci, mdsc, flags); | 
|---|
|  | 2855 | + | 
|---|
| 2671 | 2856 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2672 | 2857 | if (snap_rwsem_locked) | 
|---|
| 2673 | 2858 | up_read(&mdsc->snap_rwsem); | 
|---|
|  | 2859 | + | 
|---|
|  | 2860 | +	if (!ret) | 
|---|
|  | 2861 | +		ceph_update_cap_mis(&mdsc->metric); | 
|---|
|  | 2862 | +	else if (ret == 1) | 
|---|
|  | 2863 | +		ceph_update_cap_hit(&mdsc->metric); | 
|---|
| 2674 | 2864 |  | 
|---|
| 2675 | 2865 | dout("get_cap_refs %p ret %d got %s\n", inode, | 
|---|
| 2676 | 2866 | ret, ceph_cap_string(*got)); | 
|---|
| .. | .. | 
|---|
| 2705 | 2895 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 
|---|
| 2706 | 2896 | } | 
|---|
| 2707 | 2897 |  | 
|---|
| 2708 |  | -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) | 
|---|
|  | 2898 | +static inline int get_used_fmode(int caps) | 
|---|
| 2709 | 2899 | { | 
|---|
| 2710 |  | -	int ret, err = 0; | 
|---|
|  | 2900 | +	int fmode = 0; | 
|---|
|  | 2901 | +	if (caps & CEPH_CAP_FILE_RD) | 
|---|
|  | 2902 | +		fmode |= CEPH_FILE_MODE_RD; | 
|---|
|  | 2903 | +	if (caps & CEPH_CAP_FILE_WR) | 
|---|
|  | 2904 | +		fmode |= CEPH_FILE_MODE_WR; | 
|---|
|  | 2905 | +	return fmode; | 
|---|
|  | 2906 | +} | 
|---|
|  | 2907 | + | 
|---|
|  | 2908 | +int ceph_try_get_caps(struct inode *inode, int need, int want, | 
|---|
|  | 2909 | +		      bool nonblock, int *got) | 
|---|
|  | 2910 | +{ | 
|---|
|  | 2911 | +	int ret, flags; | 
|---|
| 2711 | 2912 |  | 
|---|
| 2712 | 2913 | BUG_ON(need & ~CEPH_CAP_FILE_RD); | 
|---|
| 2713 |  | -	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); | 
|---|
| 2714 |  | -	ret = ceph_pool_perm_check(ci, need); | 
|---|
| 2715 |  | -	if (ret < 0) | 
|---|
| 2716 |  | -		return ret; | 
|---|
| 2717 |  | - | 
|---|
| 2718 |  | -	ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); | 
|---|
| 2719 |  | -	if (ret) { | 
|---|
| 2720 |  | -		if (err == -EAGAIN) { | 
|---|
| 2721 |  | -			ret = 0; | 
|---|
| 2722 |  | -		} else if (err < 0) { | 
|---|
| 2723 |  | -			ret = err; | 
|---|
| 2724 |  | -		} | 
|---|
|  | 2914 | +	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | | 
|---|
|  | 2915 | +			CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | | 
|---|
|  | 2916 | +			CEPH_CAP_ANY_DIR_OPS)); | 
|---|
|  | 2917 | +	if (need) { | 
|---|
|  | 2918 | +		ret = ceph_pool_perm_check(inode, need); | 
|---|
|  | 2919 | +		if (ret < 0) | 
|---|
|  | 2920 | +			return ret; | 
|---|
| 2725 | 2921 | } | 
|---|
|  | 2922 | + | 
|---|
|  | 2923 | +	flags = get_used_fmode(need | want); | 
|---|
|  | 2924 | +	if (nonblock) | 
|---|
|  | 2925 | +		flags |= NON_BLOCKING; | 
|---|
|  | 2926 | + | 
|---|
|  | 2927 | +	ret = try_get_cap_refs(inode, need, want, 0, flags, got); | 
|---|
|  | 2928 | +	/* three special error codes */ | 
|---|
|  | 2929 | +	if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) | 
|---|
|  | 2930 | +		ret = 0; | 
|---|
| 2726 | 2931 | return ret; | 
|---|
| 2727 | 2932 | } | 
|---|
| 2728 | 2933 |  | 
|---|
| .. | .. | 
|---|
| 2731 | 2936 | * due to a small max_size, make sure we check_max_size (and possibly | 
|---|
| 2732 | 2937 | * ask the mds) so we don't get hung up indefinitely. | 
|---|
| 2733 | 2938 | */ | 
|---|
| 2734 |  | -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 
|---|
|  | 2939 | +int ceph_get_caps(struct file *filp, int need, int want, | 
|---|
| 2735 | 2940 | loff_t endoff, int *got, struct page **pinned_page) | 
|---|
| 2736 | 2941 | { | 
|---|
| 2737 |  | -	int _got, ret, err = 0; | 
|---|
|  | 2942 | +	struct ceph_file_info *fi = filp->private_data; | 
|---|
|  | 2943 | +	struct inode *inode = file_inode(filp); | 
|---|
|  | 2944 | +	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
|  | 2945 | +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 
|---|
|  | 2946 | +	int ret, _got, flags; | 
|---|
| 2738 | 2947 |  | 
|---|
| 2739 |  | -	ret = ceph_pool_perm_check(ci, need); | 
|---|
|  | 2948 | +	ret = ceph_pool_perm_check(inode, need); | 
|---|
| 2740 | 2949 | if (ret < 0) | 
|---|
| 2741 | 2950 | return ret; | 
|---|
| 2742 | 2951 |  | 
|---|
| 2743 |  | -	while (true) { | 
|---|
| 2744 |  | -		if (endoff > 0) | 
|---|
| 2745 |  | -			check_max_size(&ci->vfs_inode, endoff); | 
|---|
|  | 2952 | +	if ((fi->fmode & CEPH_FILE_MODE_WR) && | 
|---|
|  | 2953 | +	    fi->filp_gen != READ_ONCE(fsc->filp_gen)) | 
|---|
|  | 2954 | +		return -EBADF; | 
|---|
| 2746 | 2955 |  | 
|---|
| 2747 |  | -		err = 0; | 
|---|
|  | 2956 | +	flags = get_used_fmode(need | want); | 
|---|
|  | 2957 | + | 
|---|
|  | 2958 | +	while (true) { | 
|---|
|  | 2959 | +		flags &= CEPH_FILE_MODE_MASK; | 
|---|
|  | 2960 | +		if (atomic_read(&fi->num_locks)) | 
|---|
|  | 2961 | +			flags |= CHECK_FILELOCK; | 
|---|
| 2748 | 2962 | _got = 0; | 
|---|
| 2749 |  | -		ret = try_get_cap_refs(ci, need, want, endoff, | 
|---|
| 2750 |  | -				       false, &_got, &err); | 
|---|
| 2751 |  | -		if (ret) { | 
|---|
| 2752 |  | -			if (err == -EAGAIN) | 
|---|
| 2753 |  | -				continue; | 
|---|
| 2754 |  | -			if (err < 0) | 
|---|
| 2755 |  | -				ret = err; | 
|---|
| 2756 |  | -		} else { | 
|---|
|  | 2963 | +		ret = try_get_cap_refs(inode, need, want, endoff, | 
|---|
|  | 2964 | +				       flags, &_got); | 
|---|
|  | 2965 | +		WARN_ON_ONCE(ret == -EAGAIN); | 
|---|
|  | 2966 | +		if (!ret) { | 
|---|
|  | 2967 | +			struct ceph_mds_client *mdsc = fsc->mdsc; | 
|---|
|  | 2968 | +			struct cap_wait cw; | 
|---|
| 2757 | 2969 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | 
|---|
|  | 2970 | + | 
|---|
|  | 2971 | +			cw.ino = ceph_ino(inode); | 
|---|
|  | 2972 | +			cw.tgid = current->tgid; | 
|---|
|  | 2973 | +			cw.need = need; | 
|---|
|  | 2974 | +			cw.want = want; | 
|---|
|  | 2975 | + | 
|---|
|  | 2976 | +			spin_lock(&mdsc->caps_list_lock); | 
|---|
|  | 2977 | +			list_add(&cw.list, &mdsc->cap_wait_list); | 
|---|
|  | 2978 | +			spin_unlock(&mdsc->caps_list_lock); | 
|---|
|  | 2979 | + | 
|---|
|  | 2980 | +			/* make sure the used fmode does not time out */ | 
|---|
|  | 2981 | +			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); | 
|---|
| 2758 | 2982 | add_wait_queue(&ci->i_cap_wq, &wait); | 
|---|
| 2759 | 2983 |  | 
|---|
| 2760 |  | -			while (!try_get_cap_refs(ci, need, want, endoff, | 
|---|
| 2761 |  | -						 true, &_got, &err)) { | 
|---|
|  | 2984 | +			flags |= NON_BLOCKING; | 
|---|
|  | 2985 | +			while (!(ret = try_get_cap_refs(inode, need, want, | 
|---|
|  | 2986 | +							endoff, flags, &_got))) { | 
|---|
| 2762 | 2987 | if (signal_pending(current)) { | 
|---|
| 2763 | 2988 | ret = -ERESTARTSYS; | 
|---|
| 2764 | 2989 | break; | 
|---|
| .. | .. | 
|---|
| 2767 | 2992 | } | 
|---|
| 2768 | 2993 |  | 
|---|
| 2769 | 2994 | remove_wait_queue(&ci->i_cap_wq, &wait); | 
|---|
|  | 2995 | +			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); | 
|---|
| 2770 | 2996 |  | 
|---|
| 2771 |  | -			if (err == -EAGAIN) | 
|---|
|  | 2997 | +			spin_lock(&mdsc->caps_list_lock); | 
|---|
|  | 2998 | +			list_del(&cw.list); | 
|---|
|  | 2999 | +			spin_unlock(&mdsc->caps_list_lock); | 
|---|
|  | 3000 | + | 
|---|
|  | 3001 | +			if (ret == -EAGAIN) | 
|---|
| 2772 | 3002 | continue; | 
|---|
| 2773 |  | -			if (err < 0) | 
|---|
| 2774 |  | -				ret = err; | 
|---|
| 2775 | 3003 | } | 
|---|
|  | 3004 | + | 
|---|
|  | 3005 | +		if ((fi->fmode & CEPH_FILE_MODE_WR) && | 
|---|
|  | 3006 | +		    fi->filp_gen != READ_ONCE(fsc->filp_gen)) { | 
|---|
|  | 3007 | +			if (ret >= 0 && _got) | 
|---|
|  | 3008 | +				ceph_put_cap_refs(ci, _got); | 
|---|
|  | 3009 | +			return -EBADF; | 
|---|
|  | 3010 | +		} | 
|---|
|  | 3011 | + | 
|---|
| 2776 | 3012 | if (ret < 0) { | 
|---|
| 2777 |  | -			if (err == -ESTALE) { | 
|---|
|  | 3013 | +			if (ret == -EFBIG || ret == -ESTALE) { | 
|---|
|  | 3014 | +				int ret2 = ceph_wait_on_async_create(inode); | 
|---|
|  | 3015 | +				if (ret2 < 0) | 
|---|
|  | 3016 | +					return ret2; | 
|---|
|  | 3017 | +			} | 
|---|
|  | 3018 | +			if (ret == -EFBIG) { | 
|---|
|  | 3019 | +				check_max_size(inode, endoff); | 
|---|
|  | 3020 | +				continue; | 
|---|
|  | 3021 | +			} | 
|---|
|  | 3022 | +			if (ret == -ESTALE) { | 
|---|
| 2778 | 3023 | /* session was killed, try renew caps */ | 
|---|
| 2779 |  | -				ret = ceph_renew_caps(&ci->vfs_inode); | 
|---|
|  | 3024 | +				ret = ceph_renew_caps(inode, flags); | 
|---|
| 2780 | 3025 | if (ret == 0) | 
|---|
| 2781 | 3026 | continue; | 
|---|
| 2782 | 3027 | } | 
|---|
| 2783 | 3028 | return ret; | 
|---|
| 2784 | 3029 | } | 
|---|
| 2785 | 3030 |  | 
|---|
| 2786 |  | -		if (ci->i_inline_version != CEPH_INLINE_NONE && | 
|---|
|  | 3031 | +		if (S_ISREG(ci->vfs_inode.i_mode) && | 
|---|
|  | 3032 | +		    ci->i_inline_version != CEPH_INLINE_NONE && | 
|---|
| 2787 | 3033 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | 
|---|
| 2788 |  | -		    i_size_read(&ci->vfs_inode) > 0) { | 
|---|
|  | 3034 | +		    i_size_read(inode) > 0) { | 
|---|
| 2789 | 3035 | struct page *page = | 
|---|
| 2790 |  | -				find_get_page(ci->vfs_inode.i_mapping, 0); | 
|---|
|  | 3036 | +				find_get_page(inode->i_mapping, 0); | 
|---|
| 2791 | 3037 | if (page) { | 
|---|
| 2792 | 3038 | if (PageUptodate(page)) { | 
|---|
| 2793 | 3039 | *pinned_page = page; | 
|---|
| .. | .. | 
|---|
| 2806 | 3052 | * getattr request will bring inline data into | 
|---|
| 2807 | 3053 | * page cache | 
|---|
| 2808 | 3054 | */ | 
|---|
| 2809 |  | -			ret = __ceph_do_getattr(&ci->vfs_inode, NULL, | 
|---|
|  | 3055 | +			ret = __ceph_do_getattr(inode, NULL, | 
|---|
| 2810 | 3056 | CEPH_STAT_CAP_INLINE_DATA, | 
|---|
| 2811 | 3057 | true); | 
|---|
| 2812 | 3058 | if (ret < 0) | 
|---|
| .. | .. | 
|---|
| 2816 | 3062 | break; | 
|---|
| 2817 | 3063 | } | 
|---|
| 2818 | 3064 |  | 
|---|
| 2819 |  | -	if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) | 
|---|
|  | 3065 | +	if (S_ISREG(ci->vfs_inode.i_mode) && | 
|---|
|  | 3066 | +	    (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) | 
|---|
| 2820 | 3067 | ceph_fscache_revalidate_cookie(ci); | 
|---|
| 2821 | 3068 |  | 
|---|
| 2822 | 3069 | *got = _got; | 
|---|
| .. | .. | 
|---|
| 2830 | 3077 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) | 
|---|
| 2831 | 3078 | { | 
|---|
| 2832 | 3079 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 2833 |  | -	__take_cap_refs(ci, caps, false); | 
|---|
|  | 3080 | +	ceph_take_cap_refs(ci, caps, false); | 
|---|
| 2834 | 3081 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 2835 | 3082 | } | 
|---|
| 2836 | 3083 |  | 
|---|
| .. | .. | 
|---|
| 2867 | 3114 | * If we are releasing a WR cap (from a sync write), finalize any affected | 
|---|
| 2868 | 3115 | * cap_snap, and wake up any waiters. | 
|---|
| 2869 | 3116 | */ | 
|---|
| 2870 |  | -void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | 
|---|
|  | 3117 | +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, | 
|---|
|  | 3118 | +				bool skip_checking_caps) | 
|---|
| 2871 | 3119 | { | 
|---|
| 2872 | 3120 | struct inode *inode = &ci->vfs_inode; | 
|---|
| 2873 | 3121 | int last = 0, put = 0, flushsnaps = 0, wake = 0; | 
|---|
| .. | .. | 
|---|
| 2880 | 3128 | last++; | 
|---|
| 2881 | 3129 | if (had & CEPH_CAP_FILE_CACHE) | 
|---|
| 2882 | 3130 | if (--ci->i_rdcache_ref == 0) | 
|---|
|  | 3131 | +			last++; | 
|---|
|  | 3132 | +	if (had & CEPH_CAP_FILE_EXCL) | 
|---|
|  | 3133 | +		if (--ci->i_fx_ref == 0) | 
|---|
| 2883 | 3134 | last++; | 
|---|
| 2884 | 3135 | if (had & CEPH_CAP_FILE_BUFFER) { | 
|---|
| 2885 | 3136 | if (--ci->i_wb_ref == 0) { | 
|---|
| .. | .. | 
|---|
| 2912 | 3163 | ci->i_head_snapc = NULL; | 
|---|
| 2913 | 3164 | } | 
|---|
| 2914 | 3165 | /* see comment in __ceph_remove_cap() */ | 
|---|
| 2915 |  | -			if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) | 
|---|
|  | 3166 | +			if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) | 
|---|
| 2916 | 3167 | drop_inode_snap_realm(ci); | 
|---|
| 2917 | 3168 | } | 
|---|
| 2918 | 3169 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| .. | .. | 
|---|
| 2920 | 3171 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), | 
|---|
| 2921 | 3172 | last ? " last" : "", put ? " put" : ""); | 
|---|
| 2922 | 3173 |  | 
|---|
| 2923 |  | -	if (last && !flushsnaps) | 
|---|
| 2924 |  | -		ceph_check_caps(ci, 0, NULL); | 
|---|
| 2925 |  | -	else if (flushsnaps) | 
|---|
| 2926 |  | -		ceph_flush_snaps(ci, NULL); | 
|---|
|  | 3174 | +	if (!skip_checking_caps) { | 
|---|
|  | 3175 | +		if (last) | 
|---|
|  | 3176 | +			ceph_check_caps(ci, 0, NULL); | 
|---|
|  | 3177 | +		else if (flushsnaps) | 
|---|
|  | 3178 | +			ceph_flush_snaps(ci, NULL); | 
|---|
|  | 3179 | +	} | 
|---|
| 2927 | 3180 | if (wake) | 
|---|
| 2928 | 3181 | wake_up_all(&ci->i_cap_wq); | 
|---|
| 2929 | 3182 | while (put-- > 0) | 
|---|
| 2930 | 3183 | iput(inode); | 
|---|
|  | 3184 | +} | 
|---|
|  | 3185 | + | 
|---|
|  | 3186 | +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | 
|---|
|  | 3187 | +{ | 
|---|
|  | 3188 | +	__ceph_put_cap_refs(ci, had, false); | 
|---|
|  | 3189 | +} | 
|---|
|  | 3190 | + | 
|---|
|  | 3191 | +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) | 
|---|
|  | 3192 | +{ | 
|---|
|  | 3193 | +	__ceph_put_cap_refs(ci, had, true); | 
|---|
| 2931 | 3194 | } | 
|---|
| 2932 | 3195 |  | 
|---|
| 2933 | 3196 | /* | 
|---|
| .. | .. | 
|---|
| 2977 | 3240 | break; | 
|---|
| 2978 | 3241 | } | 
|---|
| 2979 | 3242 | } | 
|---|
| 2980 |  | -		BUG_ON(!found); | 
|---|
|  | 3243 | + | 
|---|
|  | 3244 | +		if (!found) { | 
|---|
|  | 3245 | +			/* | 
|---|
|  | 3246 | +			 * The capsnap should already be removed when removing | 
|---|
|  | 3247 | +			 * auth cap in the case of a forced unmount. | 
|---|
|  | 3248 | +			 */ | 
|---|
|  | 3249 | +			WARN_ON_ONCE(ci->i_auth_cap); | 
|---|
|  | 3250 | +			goto unlock; | 
|---|
|  | 3251 | +		} | 
|---|
|  | 3252 | + | 
|---|
| 2981 | 3253 | capsnap->dirty_pages -= nr; | 
|---|
| 2982 | 3254 | if (capsnap->dirty_pages == 0) { | 
|---|
| 2983 | 3255 | complete_capsnap = true; | 
|---|
| .. | .. | 
|---|
| 2999 | 3271 | complete_capsnap ? " (complete capsnap)" : ""); | 
|---|
| 3000 | 3272 | } | 
|---|
| 3001 | 3273 |  | 
|---|
|  | 3274 | +unlock: | 
|---|
| 3002 | 3275 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 3003 | 3276 |  | 
|---|
| 3004 | 3277 | if (last) { | 
|---|
| 3005 |  | -		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 
|---|
|  | 3278 | +		ceph_check_caps(ci, 0, NULL); | 
|---|
| 3006 | 3279 | } else if (flush_snaps) { | 
|---|
| 3007 | 3280 | ceph_flush_snaps(ci, NULL); | 
|---|
| 3008 | 3281 | } | 
|---|
| 3009 | 3282 | if (complete_capsnap) | 
|---|
| 3010 | 3283 | wake_up_all(&ci->i_cap_wq); | 
|---|
| 3011 |  | -	while (put-- > 0) | 
|---|
| 3012 |  | -		iput(inode); | 
|---|
|  | 3284 | +	while (put-- > 0) { | 
|---|
|  | 3285 | +		/* avoid calling iput_final() in osd dispatch threads */ | 
|---|
|  | 3286 | +		ceph_async_iput(inode); | 
|---|
|  | 3287 | +	} | 
|---|
| 3013 | 3288 | } | 
|---|
| 3014 | 3289 |  | 
|---|
| 3015 | 3290 | /* | 
|---|
| .. | .. | 
|---|
| 3054 | 3329 | bool dirstat_valid; | 
|---|
| 3055 | 3330 | u64 nfiles; | 
|---|
| 3056 | 3331 | u64 nsubdirs; | 
|---|
|  | 3332 | +	u64 change_attr; | 
|---|
| 3057 | 3333 | /* currently issued */ | 
|---|
| 3058 | 3334 | int issued; | 
|---|
|  | 3335 | +	struct timespec64 btime; | 
|---|
| 3059 | 3336 | }; | 
|---|
| 3060 | 3337 |  | 
|---|
| 3061 | 3338 | /* | 
|---|
| .. | .. | 
|---|
| 3079 | 3356 | int used, wanted, dirty; | 
|---|
| 3080 | 3357 | u64 size = le64_to_cpu(grant->size); | 
|---|
| 3081 | 3358 | u64 max_size = le64_to_cpu(grant->max_size); | 
|---|
| 3082 |  | -	int check_caps = 0; | 
|---|
|  | 3359 | +	unsigned char check_caps = 0; | 
|---|
|  | 3360 | +	bool was_stale = cap->cap_gen < session->s_cap_gen; | 
|---|
| 3083 | 3361 | bool wake = false; | 
|---|
| 3084 | 3362 | bool writeback = false; | 
|---|
| 3085 | 3363 | bool queue_trunc = false; | 
|---|
| .. | .. | 
|---|
| 3092 | 3370 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, | 
|---|
| 3093 | 3371 | inode->i_size); | 
|---|
| 3094 | 3372 |  | 
|---|
|  | 3373 | + | 
|---|
|  | 3374 | +	/* | 
|---|
|  | 3375 | +	 * If CACHE is being revoked, and we have no dirty buffers, | 
|---|
|  | 3376 | +	 * try to invalidate (once).  (If there are dirty buffers, we | 
|---|
|  | 3377 | +	 * will invalidate _after_ writeback.) | 
|---|
|  | 3378 | +	 */ | 
|---|
|  | 3379 | +	if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ | 
|---|
|  | 3380 | +	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && | 
|---|
|  | 3381 | +	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && | 
|---|
|  | 3382 | +	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { | 
|---|
|  | 3383 | +		if (try_nonblocking_invalidate(inode)) { | 
|---|
|  | 3384 | +			/* there were locked pages.. invalidate later | 
|---|
|  | 3385 | +			   in a separate thread. */ | 
|---|
|  | 3386 | +			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { | 
|---|
|  | 3387 | +				queue_invalidate = true; | 
|---|
|  | 3388 | +				ci->i_rdcache_revoking = ci->i_rdcache_gen; | 
|---|
|  | 3389 | +			} | 
|---|
|  | 3390 | +		} | 
|---|
|  | 3391 | +	} | 
|---|
|  | 3392 | + | 
|---|
|  | 3393 | +	if (was_stale) | 
|---|
|  | 3394 | +		cap->issued = cap->implemented = CEPH_CAP_PIN; | 
|---|
| 3095 | 3395 |  | 
|---|
| 3096 | 3396 | /* | 
|---|
| 3097 | 3397 | * auth mds of the inode changed. we received the cap export message, | 
|---|
| .. | .. | 
|---|
| 3108 | 3408 | newcaps |= cap->issued; | 
|---|
| 3109 | 3409 | } | 
|---|
| 3110 | 3410 |  | 
|---|
| 3111 |  | -	/* | 
|---|
| 3112 |  | -	 * If CACHE is being revoked, and we have no dirty buffers, | 
|---|
| 3113 |  | -	 * try to invalidate (once).  (If there are dirty buffers, we | 
|---|
| 3114 |  | -	 * will invalidate _after_ writeback.) | 
|---|
| 3115 |  | -	 */ | 
|---|
| 3116 |  | -	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ | 
|---|
| 3117 |  | -	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && | 
|---|
| 3118 |  | -	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && | 
|---|
| 3119 |  | -	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { | 
|---|
| 3120 |  | -		if (try_nonblocking_invalidate(inode)) { | 
|---|
| 3121 |  | -			/* there were locked pages.. invalidate later | 
|---|
| 3122 |  | -			   in a separate thread. */ | 
|---|
| 3123 |  | -			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { | 
|---|
| 3124 |  | -				queue_invalidate = true; | 
|---|
| 3125 |  | -				ci->i_rdcache_revoking = ci->i_rdcache_gen; | 
|---|
| 3126 |  | -			} | 
|---|
| 3127 |  | -		} | 
|---|
| 3128 |  | -	} | 
|---|
| 3129 |  | - | 
|---|
| 3130 | 3411 | /* side effects now are allowed */ | 
|---|
| 3131 | 3412 | cap->cap_gen = session->s_cap_gen; | 
|---|
| 3132 | 3413 | cap->seq = seq; | 
|---|
| 3133 | 3414 |  | 
|---|
| 3134 | 3415 | __check_cap_issue(ci, cap, newcaps); | 
|---|
| 3135 | 3416 |  | 
|---|
|  | 3417 | +	inode_set_max_iversion_raw(inode, extra_info->change_attr); | 
|---|
|  | 3418 | + | 
|---|
| 3136 | 3419 | if ((newcaps & CEPH_CAP_AUTH_SHARED) && | 
|---|
| 3137 | 3420 | (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { | 
|---|
| 3138 | 3421 | inode->i_mode = le32_to_cpu(grant->mode); | 
|---|
| 3139 | 3422 | inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); | 
|---|
| 3140 | 3423 | inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); | 
|---|
|  | 3424 | +		ci->i_btime = extra_info->btime; | 
|---|
| 3141 | 3425 | dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, | 
|---|
| 3142 | 3426 | from_kuid(&init_user_ns, inode->i_uid), | 
|---|
| 3143 | 3427 | from_kgid(&init_user_ns, inode->i_gid)); | 
|---|
| .. | .. | 
|---|
| 3164 | 3448 | ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); | 
|---|
| 3165 | 3449 | ci->i_xattrs.version = version; | 
|---|
| 3166 | 3450 | ceph_forget_all_cached_acls(inode); | 
|---|
|  | 3451 | +			ceph_security_invalidate_secctx(inode); | 
|---|
| 3167 | 3452 | } | 
|---|
| 3168 | 3453 | } | 
|---|
| 3169 | 3454 |  | 
|---|
| .. | .. | 
|---|
| 3216 | 3501 | ci->i_requested_max_size = 0; | 
|---|
| 3217 | 3502 | } | 
|---|
| 3218 | 3503 | wake = true; | 
|---|
| 3219 |  | -		} else if (ci->i_wanted_max_size > ci->i_max_size && | 
|---|
| 3220 |  | -			   ci->i_wanted_max_size > ci->i_requested_max_size) { | 
|---|
| 3221 |  | -			/* CEPH_CAP_OP_IMPORT */ | 
|---|
| 3222 |  | -			wake = true; | 
|---|
| 3223 | 3504 | } | 
|---|
| 3224 | 3505 | } | 
|---|
| 3225 | 3506 |  | 
|---|
| .. | .. | 
|---|
| 3231 | 3512 | ceph_cap_string(wanted), | 
|---|
| 3232 | 3513 | ceph_cap_string(used), | 
|---|
| 3233 | 3514 | ceph_cap_string(dirty)); | 
|---|
| 3234 |  | -	if (wanted != le32_to_cpu(grant->wanted)) { | 
|---|
| 3235 |  | -		dout("mds wanted %s -> %s\n", | 
|---|
| 3236 |  | -		     ceph_cap_string(le32_to_cpu(grant->wanted)), | 
|---|
| 3237 |  | -		     ceph_cap_string(wanted)); | 
|---|
| 3238 |  | -		/* imported cap may not have correct mds_wanted */ | 
|---|
| 3239 |  | -		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) | 
|---|
| 3240 |  | -			check_caps = 1; | 
|---|
|  | 3515 | + | 
|---|
|  | 3516 | +	if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && | 
|---|
|  | 3517 | +	    (wanted & ~(cap->mds_wanted | newcaps))) { | 
|---|
|  | 3518 | +		/* | 
|---|
|  | 3519 | +		 * If mds is importing cap, prior cap messages that update | 
|---|
|  | 3520 | +		 * 'wanted' may get dropped by mds (migrate seq mismatch). | 
|---|
|  | 3521 | +		 * | 
|---|
|  | 3522 | +		 * We don't send cap message to update 'wanted' if what we | 
|---|
|  | 3523 | +		 * want are already issued. If mds revokes caps, cap message | 
|---|
|  | 3524 | +		 * that releases caps also tells mds what we want. But if | 
|---|
|  | 3525 | +		 * caps got revoked by mds forcedly (session stale). We may | 
|---|
|  | 3526 | +		 * haven't told mds what we want. | 
|---|
|  | 3527 | +		 */ | 
|---|
|  | 3528 | +		check_caps = 1; | 
|---|
| 3241 | 3529 | } | 
|---|
| 3242 | 3530 |  | 
|---|
| 3243 | 3531 | /* revocation, grant, or no-op? */ | 
|---|
| .. | .. | 
|---|
| 3248 | 3536 | ceph_cap_string(cap->issued), | 
|---|
| 3249 | 3537 | ceph_cap_string(newcaps), | 
|---|
| 3250 | 3538 | ceph_cap_string(revoking)); | 
|---|
| 3251 |  | -		if (revoking & used & CEPH_CAP_FILE_BUFFER) | 
|---|
|  | 3539 | +		if (S_ISREG(inode->i_mode) && | 
|---|
|  | 3540 | +		    (revoking & used & CEPH_CAP_FILE_BUFFER)) | 
|---|
| 3252 | 3541 | writeback = true;  /* initiate writeback; will delay ack */ | 
|---|
| 3253 |  | -		else if (revoking == CEPH_CAP_FILE_CACHE && | 
|---|
| 3254 |  | -			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && | 
|---|
| 3255 |  | -			 queue_invalidate) | 
|---|
|  | 3542 | +		else if (queue_invalidate && | 
|---|
|  | 3543 | +			 revoking == CEPH_CAP_FILE_CACHE && | 
|---|
|  | 3544 | +			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) | 
|---|
| 3256 | 3545 | ; /* do nothing yet, invalidation will be queued */ | 
|---|
| 3257 | 3546 | else if (cap == ci->i_auth_cap) | 
|---|
| 3258 | 3547 | check_caps = 1; /* check auth cap only */ | 
|---|
| .. | .. | 
|---|
| 3288 | 3577 | } | 
|---|
| 3289 | 3578 |  | 
|---|
| 3290 | 3579 | if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { | 
|---|
| 3291 |  | -		if (newcaps & ~extra_info->issued) | 
|---|
| 3292 |  | -			wake = true; | 
|---|
| 3293 |  | -		kick_flushing_inode_caps(session->s_mdsc, session, inode); | 
|---|
|  | 3580 | +		if (ci->i_auth_cap == cap) { | 
|---|
|  | 3581 | +			if (newcaps & ~extra_info->issued) | 
|---|
|  | 3582 | +				wake = true; | 
|---|
|  | 3583 | + | 
|---|
|  | 3584 | +			if (ci->i_requested_max_size > max_size || | 
|---|
|  | 3585 | +			    !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { | 
|---|
|  | 3586 | +				/* re-request max_size if necessary */ | 
|---|
|  | 3587 | +				ci->i_requested_max_size = 0; | 
|---|
|  | 3588 | +				wake = true; | 
|---|
|  | 3589 | +			} | 
|---|
|  | 3590 | + | 
|---|
|  | 3591 | +			ceph_kick_flushing_inode_caps(session, ci); | 
|---|
|  | 3592 | +		} | 
|---|
| 3294 | 3593 | up_read(&session->s_mdsc->snap_rwsem); | 
|---|
| 3295 |  | -	} else { | 
|---|
| 3296 |  | -		spin_unlock(&ci->i_ceph_lock); | 
|---|
| 3297 | 3594 | } | 
|---|
|  | 3595 | +	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 3298 | 3596 |  | 
|---|
| 3299 | 3597 | if (fill_inline) | 
|---|
| 3300 | 3598 | ceph_fill_inline_data(inode, NULL, extra_info->inline_data, | 
|---|
| .. | .. | 
|---|
| 3318 | 3616 | wake_up_all(&ci->i_cap_wq); | 
|---|
| 3319 | 3617 |  | 
|---|
| 3320 | 3618 | if (check_caps == 1) | 
|---|
| 3321 |  | -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, | 
|---|
|  | 3619 | +		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, | 
|---|
| 3322 | 3620 | session); | 
|---|
| 3323 | 3621 | else if (check_caps == 2) | 
|---|
| 3324 |  | -		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); | 
|---|
|  | 3622 | +		ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); | 
|---|
| 3325 | 3623 | else | 
|---|
| 3326 | 3624 | mutex_unlock(&session->s_mutex); | 
|---|
| 3327 | 3625 | } | 
|---|
| .. | .. | 
|---|
| 3348 | 3646 | bool wake_mdsc = false; | 
|---|
| 3349 | 3647 |  | 
|---|
| 3350 | 3648 | list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { | 
|---|
|  | 3649 | +		/* Is this the one that was flushed? */ | 
|---|
| 3351 | 3650 | if (cf->tid == flush_tid) | 
|---|
| 3352 | 3651 | cleaned = cf->caps; | 
|---|
| 3353 |  | -		if (cf->caps == 0) /* capsnap */ | 
|---|
|  | 3652 | + | 
|---|
|  | 3653 | +		/* Is this a capsnap? */ | 
|---|
|  | 3654 | +		if (cf->is_capsnap) | 
|---|
| 3354 | 3655 | continue; | 
|---|
|  | 3656 | + | 
|---|
| 3355 | 3657 | if (cf->tid <= flush_tid) { | 
|---|
| 3356 |  | -			if (__finish_cap_flush(NULL, ci, cf)) | 
|---|
| 3357 |  | -				wake_ci = true; | 
|---|
|  | 3658 | +			/* | 
|---|
|  | 3659 | +			 * An earlier or current tid. The FLUSH_ACK should | 
|---|
|  | 3660 | +			 * represent a superset of this flush's caps. | 
|---|
|  | 3661 | +			 */ | 
|---|
|  | 3662 | +			wake_ci |= __detach_cap_flush_from_ci(ci, cf); | 
|---|
| 3358 | 3663 | list_add_tail(&cf->i_list, &to_remove); | 
|---|
| 3359 | 3664 | } else { | 
|---|
|  | 3665 | +			/* | 
|---|
|  | 3666 | +			 * This is a later one. Any caps in it are still dirty | 
|---|
|  | 3667 | +			 * so don't count them as cleaned. | 
|---|
|  | 3668 | +			 */ | 
|---|
| 3360 | 3669 | cleaned &= ~cf->caps; | 
|---|
| 3361 | 3670 | if (!cleaned) | 
|---|
| 3362 | 3671 | break; | 
|---|
| .. | .. | 
|---|
| 3376 | 3685 |  | 
|---|
| 3377 | 3686 | spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 3378 | 3687 |  | 
|---|
| 3379 |  | -	list_for_each_entry(cf, &to_remove, i_list) { | 
|---|
| 3380 |  | -		if (__finish_cap_flush(mdsc, NULL, cf)) | 
|---|
| 3381 |  | -			wake_mdsc = true; | 
|---|
| 3382 |  | -	} | 
|---|
|  | 3688 | +	list_for_each_entry(cf, &to_remove, i_list) | 
|---|
|  | 3689 | +		wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); | 
|---|
| 3383 | 3690 |  | 
|---|
| 3384 | 3691 | if (ci->i_flushing_caps == 0) { | 
|---|
| 3385 | 3692 | if (list_empty(&ci->i_cap_flush_list)) { | 
|---|
| .. | .. | 
|---|
| 3417 | 3724 | while (!list_empty(&to_remove)) { | 
|---|
| 3418 | 3725 | cf = list_first_entry(&to_remove, | 
|---|
| 3419 | 3726 | struct ceph_cap_flush, i_list); | 
|---|
| 3420 |  | -		list_del(&cf->i_list); | 
|---|
| 3421 |  | -		ceph_free_cap_flush(cf); | 
|---|
|  | 3727 | +		list_del_init(&cf->i_list); | 
|---|
|  | 3728 | +		if (!cf->is_capsnap) | 
|---|
|  | 3729 | +			ceph_free_cap_flush(cf); | 
|---|
| 3422 | 3730 | } | 
|---|
| 3423 | 3731 |  | 
|---|
| 3424 | 3732 | if (wake_ci) | 
|---|
| .. | .. | 
|---|
| 3427 | 3735 | wake_up_all(&mdsc->cap_flushing_wq); | 
|---|
| 3428 | 3736 | if (drop) | 
|---|
| 3429 | 3737 | iput(inode); | 
|---|
|  | 3738 | +} | 
|---|
|  | 3739 | + | 
|---|
|  | 3740 | +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, | 
|---|
|  | 3741 | +			   bool *wake_ci, bool *wake_mdsc) | 
|---|
|  | 3742 | +{ | 
|---|
|  | 3743 | +	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
|  | 3744 | +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 
|---|
|  | 3745 | +	bool ret; | 
|---|
|  | 3746 | + | 
|---|
|  | 3747 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 3748 | + | 
|---|
|  | 3749 | +	dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); | 
|---|
|  | 3750 | + | 
|---|
|  | 3751 | +	list_del_init(&capsnap->ci_item); | 
|---|
|  | 3752 | +	ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); | 
|---|
|  | 3753 | +	if (wake_ci) | 
|---|
|  | 3754 | +		*wake_ci = ret; | 
|---|
|  | 3755 | + | 
|---|
|  | 3756 | +	spin_lock(&mdsc->cap_dirty_lock); | 
|---|
|  | 3757 | +	if (list_empty(&ci->i_cap_flush_list)) | 
|---|
|  | 3758 | +		list_del_init(&ci->i_flushing_item); | 
|---|
|  | 3759 | + | 
|---|
|  | 3760 | +	ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); | 
|---|
|  | 3761 | +	if (wake_mdsc) | 
|---|
|  | 3762 | +		*wake_mdsc = ret; | 
|---|
|  | 3763 | +	spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
|  | 3764 | +} | 
|---|
|  | 3765 | + | 
|---|
|  | 3766 | +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, | 
|---|
|  | 3767 | +			 bool *wake_ci, bool *wake_mdsc) | 
|---|
|  | 3768 | +{ | 
|---|
|  | 3769 | +	struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
|  | 3770 | + | 
|---|
|  | 3771 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
|  | 3772 | + | 
|---|
|  | 3773 | +	WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); | 
|---|
|  | 3774 | +	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); | 
|---|
| 3430 | 3775 | } | 
|---|
| 3431 | 3776 |  | 
|---|
| 3432 | 3777 | /* | 
|---|
| .. | .. | 
|---|
| 3466 | 3811 | capsnap, capsnap->follows); | 
|---|
| 3467 | 3812 | } | 
|---|
| 3468 | 3813 | } | 
|---|
| 3469 |  | -	if (flushed) { | 
|---|
| 3470 |  | -		WARN_ON(capsnap->dirty_pages || capsnap->writing); | 
|---|
| 3471 |  | -		dout(" removing %p cap_snap %p follows %lld\n", | 
|---|
| 3472 |  | -		     inode, capsnap, follows); | 
|---|
| 3473 |  | -		list_del(&capsnap->ci_item); | 
|---|
| 3474 |  | -		if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) | 
|---|
| 3475 |  | -			wake_ci = true; | 
|---|
| 3476 |  | - | 
|---|
| 3477 |  | -		spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 3478 |  | - | 
|---|
| 3479 |  | -		if (list_empty(&ci->i_cap_flush_list)) | 
|---|
| 3480 |  | -			list_del_init(&ci->i_flushing_item); | 
|---|
| 3481 |  | - | 
|---|
| 3482 |  | -		if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) | 
|---|
| 3483 |  | -			wake_mdsc = true; | 
|---|
| 3484 |  | - | 
|---|
| 3485 |  | -		spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
| 3486 |  | -	} | 
|---|
|  | 3814 | +	if (flushed) | 
|---|
|  | 3815 | +		ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); | 
|---|
| 3487 | 3816 | spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 3817 | + | 
|---|
| 3488 | 3818 | if (flushed) { | 
|---|
| 3489 | 3819 | ceph_put_snap_context(capsnap->context); | 
|---|
| 3490 | 3820 | ceph_put_cap_snap(capsnap); | 
|---|
| .. | .. | 
|---|
| 3501 | 3831 | * | 
|---|
| 3502 | 3832 | * caller hold s_mutex. | 
|---|
| 3503 | 3833 | */ | 
|---|
| 3504 |  | -static void handle_cap_trunc(struct inode *inode, | 
|---|
|  | 3834 | +static bool handle_cap_trunc(struct inode *inode, | 
|---|
| 3505 | 3835 | struct ceph_mds_caps *trunc, | 
|---|
| 3506 | 3836 | struct ceph_mds_session *session) | 
|---|
| 3507 |  | -	__releases(ci->i_ceph_lock) | 
|---|
| 3508 | 3837 | { | 
|---|
| 3509 | 3838 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 3510 | 3839 | int mds = session->s_mds; | 
|---|
| .. | .. | 
|---|
| 3515 | 3844 | int implemented = 0; | 
|---|
| 3516 | 3845 | int dirty = __ceph_caps_dirty(ci); | 
|---|
| 3517 | 3846 | int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); | 
|---|
| 3518 |  | -	int queue_trunc = 0; | 
|---|
|  | 3847 | +	bool queue_trunc = false; | 
|---|
|  | 3848 | + | 
|---|
|  | 3849 | +	lockdep_assert_held(&ci->i_ceph_lock); | 
|---|
| 3519 | 3850 |  | 
|---|
| 3520 | 3851 | issued |= implemented | dirty; | 
|---|
| 3521 | 3852 |  | 
|---|
| .. | .. | 
|---|
| 3523 | 3854 | inode, mds, seq, truncate_size, truncate_seq); | 
|---|
| 3524 | 3855 | queue_trunc = ceph_fill_file_size(inode, issued, | 
|---|
| 3525 | 3856 | truncate_seq, truncate_size, size); | 
|---|
| 3526 |  | -	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 3527 |  | - | 
|---|
| 3528 |  | -	if (queue_trunc) | 
|---|
| 3529 |  | -		ceph_queue_vmtruncate(inode); | 
|---|
|  | 3857 | +	return queue_trunc; | 
|---|
| 3530 | 3858 | } | 
|---|
| 3531 | 3859 |  | 
|---|
| 3532 | 3860 | /* | 
|---|
| .. | .. | 
|---|
| 3571 | 3899 |  | 
|---|
| 3572 | 3900 | if (target < 0) { | 
|---|
| 3573 | 3901 | __ceph_remove_cap(cap, false); | 
|---|
| 3574 |  | -		if (!ci->i_auth_cap) | 
|---|
| 3575 |  | -			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; | 
|---|
| 3576 | 3902 | goto out_unlock; | 
|---|
| 3577 | 3903 | } | 
|---|
| 3578 | 3904 |  | 
|---|
| .. | .. | 
|---|
| 3602 | 3928 | tcap->issue_seq = t_seq - 1; | 
|---|
| 3603 | 3929 | tcap->issued |= issued; | 
|---|
| 3604 | 3930 | tcap->implemented |= issued; | 
|---|
| 3605 |  | -			if (cap == ci->i_auth_cap) | 
|---|
|  | 3931 | +			if (cap == ci->i_auth_cap) { | 
|---|
| 3606 | 3932 | ci->i_auth_cap = tcap; | 
|---|
| 3607 |  | - | 
|---|
| 3608 |  | -			if (!list_empty(&ci->i_cap_flush_list) && | 
|---|
| 3609 |  | -			    ci->i_auth_cap == tcap) { | 
|---|
| 3610 |  | -				spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 3611 |  | -				list_move_tail(&ci->i_flushing_item, | 
|---|
| 3612 |  | -					       &tcap->session->s_cap_flushing); | 
|---|
| 3613 |  | -				spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
|  | 3933 | +				change_auth_cap_ses(ci, tcap->session); | 
|---|
| 3614 | 3934 | } | 
|---|
| 3615 | 3935 | } | 
|---|
| 3616 | 3936 | __ceph_remove_cap(cap, false); | 
|---|
| .. | .. | 
|---|
| 3619 | 3939 | /* add placeholder for the export tagert */ | 
|---|
| 3620 | 3940 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; | 
|---|
| 3621 | 3941 | tcap = new_cap; | 
|---|
| 3622 |  | -		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, | 
|---|
|  | 3942 | +		ceph_add_cap(inode, tsession, t_cap_id, issued, 0, | 
|---|
| 3623 | 3943 | t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); | 
|---|
| 3624 | 3944 |  | 
|---|
| 3625 | 3945 | if (!list_empty(&ci->i_cap_flush_list) && | 
|---|
| .. | .. | 
|---|
| 3679 | 3999 | struct ceph_mds_cap_peer *ph, | 
|---|
| 3680 | 4000 | struct ceph_mds_session *session, | 
|---|
| 3681 | 4001 | struct ceph_cap **target_cap, int *old_issued) | 
|---|
| 3682 |  | -	__acquires(ci->i_ceph_lock) | 
|---|
| 3683 | 4002 | { | 
|---|
| 3684 | 4003 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|---|
| 3685 | 4004 | struct ceph_cap *cap, *ocap, *new_cap = NULL; | 
|---|
| .. | .. | 
|---|
| 3704 | 4023 |  | 
|---|
| 3705 | 4024 | dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", | 
|---|
| 3706 | 4025 | inode, ci, mds, mseq, peer); | 
|---|
| 3707 |  | - | 
|---|
| 3708 | 4026 | retry: | 
|---|
| 3709 |  | -	spin_lock(&ci->i_ceph_lock); | 
|---|
| 3710 | 4027 | cap = __get_cap_for_mds(ci, mds); | 
|---|
| 3711 | 4028 | if (!cap) { | 
|---|
| 3712 | 4029 | if (!new_cap) { | 
|---|
| 3713 | 4030 | spin_unlock(&ci->i_ceph_lock); | 
|---|
| 3714 | 4031 | new_cap = ceph_get_cap(mdsc, NULL); | 
|---|
|  | 4032 | +			spin_lock(&ci->i_ceph_lock); | 
|---|
| 3715 | 4033 | goto retry; | 
|---|
| 3716 | 4034 | } | 
|---|
| 3717 | 4035 | cap = new_cap; | 
|---|
| .. | .. | 
|---|
| 3725 | 4043 | __ceph_caps_issued(ci, &issued); | 
|---|
| 3726 | 4044 | issued |= __ceph_caps_dirty(ci); | 
|---|
| 3727 | 4045 |  | 
|---|
| 3728 |  | -	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, | 
|---|
|  | 4046 | +	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, | 
|---|
| 3729 | 4047 | realmino, CEPH_CAP_FLAG_AUTH, &new_cap); | 
|---|
| 3730 | 4048 |  | 
|---|
| 3731 | 4049 | ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; | 
|---|
| .. | .. | 
|---|
| 3745 | 4063 | } | 
|---|
| 3746 | 4064 | __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); | 
|---|
| 3747 | 4065 | } | 
|---|
| 3748 |  | - | 
|---|
| 3749 |  | -	/* make sure we re-request max_size, if necessary */ | 
|---|
| 3750 |  | -	ci->i_requested_max_size = 0; | 
|---|
| 3751 | 4066 |  | 
|---|
| 3752 | 4067 | *old_issued = issued; | 
|---|
| 3753 | 4068 | *target_cap = cap; | 
|---|
| .. | .. | 
|---|
| 3777 | 4092 | size_t snaptrace_len; | 
|---|
| 3778 | 4093 | void *p, *end; | 
|---|
| 3779 | 4094 | struct cap_extra_info extra_info = {}; | 
|---|
|  | 4095 | +	bool queue_trunc; | 
|---|
| 3780 | 4096 |  | 
|---|
| 3781 | 4097 | dout("handle_caps from mds%d\n", session->s_mds); | 
|---|
| 3782 | 4098 |  | 
|---|
| .. | .. | 
|---|
| 3852 | 4168 | } | 
|---|
| 3853 | 4169 | } | 
|---|
| 3854 | 4170 |  | 
|---|
| 3855 |  | -	if (msg_version >= 11) { | 
|---|
|  | 4171 | +	if (msg_version >= 9) { | 
|---|
| 3856 | 4172 | struct ceph_timespec *btime; | 
|---|
| 3857 |  | -		u64 change_attr; | 
|---|
| 3858 |  | -		u32 flags; | 
|---|
| 3859 | 4173 |  | 
|---|
| 3860 |  | -		/* version >= 9 */ | 
|---|
| 3861 | 4174 | if (p + sizeof(*btime) > end) | 
|---|
| 3862 | 4175 | goto bad; | 
|---|
| 3863 | 4176 | btime = p; | 
|---|
|  | 4177 | +		ceph_decode_timespec64(&extra_info.btime, btime); | 
|---|
| 3864 | 4178 | p += sizeof(*btime); | 
|---|
| 3865 |  | -		ceph_decode_64_safe(&p, end, change_attr, bad); | 
|---|
|  | 4179 | +		ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); | 
|---|
|  | 4180 | +	} | 
|---|
|  | 4181 | + | 
|---|
|  | 4182 | +	if (msg_version >= 11) { | 
|---|
|  | 4183 | +		u32 flags; | 
|---|
| 3866 | 4184 | /* version >= 10 */ | 
|---|
| 3867 | 4185 | ceph_decode_32_safe(&p, end, flags, bad); | 
|---|
| 3868 | 4186 | /* version >= 11 */ | 
|---|
| .. | .. | 
|---|
| 3878 | 4196 | vino.snap, inode); | 
|---|
| 3879 | 4197 |  | 
|---|
| 3880 | 4198 | mutex_lock(&session->s_mutex); | 
|---|
| 3881 |  | -	session->s_seq++; | 
|---|
|  | 4199 | +	inc_session_sequence(session); | 
|---|
| 3882 | 4200 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, | 
|---|
| 3883 | 4201 | (unsigned)seq); | 
|---|
| 3884 | 4202 |  | 
|---|
| .. | .. | 
|---|
| 3894 | 4212 | cap->seq = seq; | 
|---|
| 3895 | 4213 | cap->issue_seq = seq; | 
|---|
| 3896 | 4214 | spin_lock(&session->s_cap_lock); | 
|---|
| 3897 |  | -			list_add_tail(&cap->session_caps, | 
|---|
| 3898 |  | -					&session->s_cap_releases); | 
|---|
| 3899 |  | -			session->s_num_cap_releases++; | 
|---|
|  | 4215 | +			__ceph_queue_cap_release(session, cap); | 
|---|
| 3900 | 4216 | spin_unlock(&session->s_cap_lock); | 
|---|
| 3901 | 4217 | } | 
|---|
| 3902 | 4218 | goto flush_cap_releases; | 
|---|
| .. | .. | 
|---|
| 3924 | 4240 | } else { | 
|---|
| 3925 | 4241 | down_read(&mdsc->snap_rwsem); | 
|---|
| 3926 | 4242 | } | 
|---|
|  | 4243 | +		spin_lock(&ci->i_ceph_lock); | 
|---|
| 3927 | 4244 | handle_cap_import(mdsc, inode, h, peer, session, | 
|---|
| 3928 | 4245 | &cap, &extra_info.issued); | 
|---|
| 3929 | 4246 | handle_cap_grant(inode, session, cap, | 
|---|
| .. | .. | 
|---|
| 3960 | 4277 | break; | 
|---|
| 3961 | 4278 |  | 
|---|
| 3962 | 4279 | case CEPH_CAP_OP_TRUNC: | 
|---|
| 3963 |  | -		handle_cap_trunc(inode, h, session); | 
|---|
|  | 4280 | +		queue_trunc = handle_cap_trunc(inode, h, session); | 
|---|
|  | 4281 | +		spin_unlock(&ci->i_ceph_lock); | 
|---|
|  | 4282 | +		if (queue_trunc) | 
|---|
|  | 4283 | +			ceph_queue_vmtruncate(inode); | 
|---|
| 3964 | 4284 | break; | 
|---|
| 3965 | 4285 |  | 
|---|
| 3966 | 4286 | default: | 
|---|
| .. | .. | 
|---|
| 3969 | 4289 | ceph_cap_op_name(op)); | 
|---|
| 3970 | 4290 | } | 
|---|
| 3971 | 4291 |  | 
|---|
| 3972 |  | -	goto done; | 
|---|
|  | 4292 | +done: | 
|---|
|  | 4293 | +	mutex_unlock(&session->s_mutex); | 
|---|
|  | 4294 | +done_unlocked: | 
|---|
|  | 4295 | +	ceph_put_string(extra_info.pool_ns); | 
|---|
|  | 4296 | +	/* avoid calling iput_final() in mds dispatch threads */ | 
|---|
|  | 4297 | +	ceph_async_iput(inode); | 
|---|
|  | 4298 | +	return; | 
|---|
| 3973 | 4299 |  | 
|---|
| 3974 | 4300 | flush_cap_releases: | 
|---|
| 3975 | 4301 | /* | 
|---|
| .. | .. | 
|---|
| 3977 | 4303 | * along for the mds (who clearly thinks we still have this | 
|---|
| 3978 | 4304 | * cap). | 
|---|
| 3979 | 4305 | */ | 
|---|
| 3980 |  | -	ceph_send_cap_releases(mdsc, session); | 
|---|
| 3981 |  | - | 
|---|
| 3982 |  | -done: | 
|---|
| 3983 |  | -	mutex_unlock(&session->s_mutex); | 
|---|
| 3984 |  | -done_unlocked: | 
|---|
| 3985 |  | -	iput(inode); | 
|---|
| 3986 |  | -	ceph_put_string(extra_info.pool_ns); | 
|---|
| 3987 |  | -	return; | 
|---|
|  | 4306 | +	ceph_flush_cap_releases(mdsc, session); | 
|---|
|  | 4307 | +	goto done; | 
|---|
| 3988 | 4308 |  | 
|---|
| 3989 | 4309 | bad: | 
|---|
| 3990 | 4310 | pr_err("ceph_handle_caps: corrupt message\n"); | 
|---|
| .. | .. | 
|---|
| 3994 | 4314 |  | 
|---|
| 3995 | 4315 | /* | 
|---|
| 3996 | 4316 | * Delayed work handler to process end of delayed cap release LRU list. | 
|---|
|  | 4317 | + * | 
|---|
|  | 4318 | + * If new caps are added to the list while processing it, these won't get | 
|---|
|  | 4319 | + * processed in this run.  In this case, the ci->i_hold_caps_max will be | 
|---|
|  | 4320 | + * returned so that the work can be scheduled accordingly. | 
|---|
| 3997 | 4321 | */ | 
|---|
| 3998 |  | -void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) | 
|---|
|  | 4322 | +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) | 
|---|
| 3999 | 4323 | { | 
|---|
| 4000 | 4324 | struct inode *inode; | 
|---|
| 4001 | 4325 | struct ceph_inode_info *ci; | 
|---|
| 4002 |  | -	int flags = CHECK_CAPS_NODELAY; | 
|---|
|  | 4326 | +	struct ceph_mount_options *opt = mdsc->fsc->mount_options; | 
|---|
|  | 4327 | +	unsigned long delay_max = opt->caps_wanted_delay_max * HZ; | 
|---|
|  | 4328 | +	unsigned long loop_start = jiffies; | 
|---|
|  | 4329 | +	unsigned long delay = 0; | 
|---|
| 4003 | 4330 |  | 
|---|
| 4004 | 4331 | dout("check_delayed_caps\n"); | 
|---|
| 4005 |  | -	while (1) { | 
|---|
| 4006 |  | -		spin_lock(&mdsc->cap_delay_lock); | 
|---|
| 4007 |  | -		if (list_empty(&mdsc->cap_delay_list)) | 
|---|
| 4008 |  | -			break; | 
|---|
|  | 4332 | +	spin_lock(&mdsc->cap_delay_lock); | 
|---|
|  | 4333 | +	while (!list_empty(&mdsc->cap_delay_list)) { | 
|---|
| 4009 | 4334 | ci = list_first_entry(&mdsc->cap_delay_list, | 
|---|
| 4010 | 4335 | struct ceph_inode_info, | 
|---|
| 4011 | 4336 | i_cap_delay_list); | 
|---|
|  | 4337 | +		if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { | 
|---|
|  | 4338 | +			dout("%s caps added recently.  Exiting loop", __func__); | 
|---|
|  | 4339 | +			delay = ci->i_hold_caps_max; | 
|---|
|  | 4340 | +			break; | 
|---|
|  | 4341 | +		} | 
|---|
| 4012 | 4342 | if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && | 
|---|
| 4013 | 4343 | time_before(jiffies, ci->i_hold_caps_max)) | 
|---|
| 4014 | 4344 | break; | 
|---|
| 4015 | 4345 | list_del_init(&ci->i_cap_delay_list); | 
|---|
| 4016 | 4346 |  | 
|---|
| 4017 | 4347 | inode = igrab(&ci->vfs_inode); | 
|---|
| 4018 |  | -		spin_unlock(&mdsc->cap_delay_lock); | 
|---|
| 4019 |  | - | 
|---|
| 4020 | 4348 | if (inode) { | 
|---|
|  | 4349 | +			spin_unlock(&mdsc->cap_delay_lock); | 
|---|
| 4021 | 4350 | dout("check_delayed_caps on %p\n", inode); | 
|---|
| 4022 |  | -			ceph_check_caps(ci, flags, NULL); | 
|---|
| 4023 |  | -			iput(inode); | 
|---|
|  | 4351 | +			ceph_check_caps(ci, 0, NULL); | 
|---|
|  | 4352 | +			/* avoid calling iput_final() in tick thread */ | 
|---|
|  | 4353 | +			ceph_async_iput(inode); | 
|---|
|  | 4354 | +			spin_lock(&mdsc->cap_delay_lock); | 
|---|
| 4024 | 4355 | } | 
|---|
| 4025 | 4356 | } | 
|---|
| 4026 | 4357 | spin_unlock(&mdsc->cap_delay_lock); | 
|---|
|  | 4358 | + | 
|---|
|  | 4359 | +	return delay; | 
|---|
| 4027 | 4360 | } | 
|---|
| 4028 | 4361 |  | 
|---|
| 4029 | 4362 | /* | 
|---|
| 4030 | 4363 | * Flush all dirty caps to the mds | 
|---|
| 4031 | 4364 | */ | 
|---|
| 4032 |  | -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) | 
|---|
|  | 4365 | +static void flush_dirty_session_caps(struct ceph_mds_session *s) | 
|---|
| 4033 | 4366 | { | 
|---|
|  | 4367 | +	struct ceph_mds_client *mdsc = s->s_mdsc; | 
|---|
| 4034 | 4368 | struct ceph_inode_info *ci; | 
|---|
| 4035 | 4369 | struct inode *inode; | 
|---|
| 4036 | 4370 |  | 
|---|
| 4037 | 4371 | dout("flush_dirty_caps\n"); | 
|---|
| 4038 | 4372 | spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 4039 |  | -	while (!list_empty(&mdsc->cap_dirty)) { | 
|---|
| 4040 |  | -		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, | 
|---|
|  | 4373 | +	while (!list_empty(&s->s_cap_dirty)) { | 
|---|
|  | 4374 | +		ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, | 
|---|
| 4041 | 4375 | i_dirty_item); | 
|---|
| 4042 | 4376 | inode = &ci->vfs_inode; | 
|---|
| 4043 | 4377 | ihold(inode); | 
|---|
| 4044 | 4378 | dout("flush_dirty_caps %p\n", inode); | 
|---|
| 4045 | 4379 | spin_unlock(&mdsc->cap_dirty_lock); | 
|---|
| 4046 |  | -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); | 
|---|
|  | 4380 | +		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); | 
|---|
| 4047 | 4381 | iput(inode); | 
|---|
| 4048 | 4382 | spin_lock(&mdsc->cap_dirty_lock); | 
|---|
| 4049 | 4383 | } | 
|---|
| .. | .. | 
|---|
| 4051 | 4385 | dout("flush_dirty_caps done\n"); | 
|---|
| 4052 | 4386 | } | 
|---|
| 4053 | 4387 |  | 
|---|
| 4054 |  | -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) | 
|---|
|  | 4388 | +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) | 
|---|
| 4055 | 4389 | { | 
|---|
| 4056 |  | -	int i; | 
|---|
|  | 4390 | +	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); | 
|---|
|  | 4391 | +} | 
|---|
|  | 4392 | + | 
|---|
|  | 4393 | +void __ceph_touch_fmode(struct ceph_inode_info *ci, | 
|---|
|  | 4394 | +			struct ceph_mds_client *mdsc, int fmode) | 
|---|
|  | 4395 | +{ | 
|---|
|  | 4396 | +	unsigned long now = jiffies; | 
|---|
|  | 4397 | +	if (fmode & CEPH_FILE_MODE_RD) | 
|---|
|  | 4398 | +		ci->i_last_rd = now; | 
|---|
|  | 4399 | +	if (fmode & CEPH_FILE_MODE_WR) | 
|---|
|  | 4400 | +		ci->i_last_wr = now; | 
|---|
|  | 4401 | +	/* queue periodic check */ | 
|---|
|  | 4402 | +	if (fmode && | 
|---|
|  | 4403 | +	    __ceph_is_any_real_caps(ci) && | 
|---|
|  | 4404 | +	    list_empty(&ci->i_cap_delay_list)) | 
|---|
|  | 4405 | +		__cap_delay_requeue(mdsc, ci); | 
|---|
|  | 4406 | +} | 
|---|
|  | 4407 | + | 
|---|
|  | 4408 | +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) | 
|---|
|  | 4409 | +{ | 
|---|
|  | 4410 | +	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); | 
|---|
| 4057 | 4411 | int bits = (fmode << 1) | 1; | 
|---|
|  | 4412 | +	bool already_opened = false; | 
|---|
|  | 4413 | +	int i; | 
|---|
|  | 4414 | + | 
|---|
|  | 4415 | +	if (count == 1) | 
|---|
|  | 4416 | +		atomic64_inc(&mdsc->metric.opened_files); | 
|---|
|  | 4417 | + | 
|---|
|  | 4418 | +	spin_lock(&ci->i_ceph_lock); | 
|---|
| 4058 | 4419 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { | 
|---|
|  | 4420 | +		/* | 
|---|
|  | 4421 | +		 * If any of the mode refs is larger than 0, | 
|---|
|  | 4422 | +		 * the inode has already been opened by | 
|---|
|  | 4423 | +		 * others. Just skip checking the PIN ref. | 
|---|
|  | 4424 | +		 */ | 
|---|
|  | 4425 | +		if (i && ci->i_nr_by_mode[i]) | 
|---|
|  | 4426 | +			already_opened = true; | 
|---|
|  | 4427 | + | 
|---|
| 4059 | 4428 | if (bits & (1 << i)) | 
|---|
| 4060 |  | -			ci->i_nr_by_mode[i]++; | 
|---|
|  | 4429 | +			ci->i_nr_by_mode[i] += count; | 
|---|
| 4061 | 4430 | } | 
|---|
|  | 4431 | + | 
|---|
|  | 4432 | +	if (!already_opened) | 
|---|
|  | 4433 | +		percpu_counter_inc(&mdsc->metric.opened_inodes); | 
|---|
|  | 4434 | +	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 4062 | 4435 | } | 
|---|
| 4063 | 4436 |  | 
|---|
| 4064 | 4437 | /* | 
|---|
| .. | .. | 
|---|
| 4066 | 4439 | * we may need to release capabilities to the MDS (or schedule | 
|---|
| 4067 | 4440 | * their delayed release). | 
|---|
| 4068 | 4441 | */ | 
|---|
| 4069 |  | -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) | 
|---|
|  | 4442 | +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) | 
|---|
| 4070 | 4443 | { | 
|---|
| 4071 |  | -	int i, last = 0; | 
|---|
|  | 4444 | +	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); | 
|---|
| 4072 | 4445 | int bits = (fmode << 1) | 1; | 
|---|
|  | 4446 | +	bool is_closed = true; | 
|---|
|  | 4447 | +	int i; | 
|---|
|  | 4448 | + | 
|---|
|  | 4449 | +	if (count == 1) | 
|---|
|  | 4450 | +		atomic64_dec(&mdsc->metric.opened_files); | 
|---|
|  | 4451 | + | 
|---|
| 4073 | 4452 | spin_lock(&ci->i_ceph_lock); | 
|---|
| 4074 | 4453 | for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { | 
|---|
| 4075 | 4454 | if (bits & (1 << i)) { | 
|---|
| 4076 |  | -			BUG_ON(ci->i_nr_by_mode[i] == 0); | 
|---|
| 4077 |  | -			if (--ci->i_nr_by_mode[i] == 0) | 
|---|
| 4078 |  | -				last++; | 
|---|
|  | 4455 | +			BUG_ON(ci->i_nr_by_mode[i] < count); | 
|---|
|  | 4456 | +			ci->i_nr_by_mode[i] -= count; | 
|---|
| 4079 | 4457 | } | 
|---|
| 4080 |  | -	} | 
|---|
| 4081 |  | -	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", | 
|---|
| 4082 |  | -	     &ci->vfs_inode, fmode, | 
|---|
| 4083 |  | -	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], | 
|---|
| 4084 |  | -	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); | 
|---|
| 4085 |  | -	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 4086 | 4458 |  | 
|---|
| 4087 |  | -	if (last && ci->i_vino.snap == CEPH_NOSNAP) | 
|---|
| 4088 |  | -		ceph_check_caps(ci, 0, NULL); | 
|---|
|  | 4459 | +		/* | 
|---|
|  | 4460 | +		 * If any of the mode refs is still non-zero | 
|---|
|  | 4461 | +		 * after the decrement, the inode is still opened | 
|---|
|  | 4462 | +		 * by others. Just skip checking the PIN ref. | 
|---|
|  | 4463 | +		 */ | 
|---|
|  | 4464 | +		if (i && ci->i_nr_by_mode[i]) | 
|---|
|  | 4465 | +			is_closed = false; | 
|---|
|  | 4466 | +	} | 
|---|
|  | 4467 | + | 
|---|
|  | 4468 | +	if (is_closed) | 
|---|
|  | 4469 | +		percpu_counter_dec(&mdsc->metric.opened_inodes); | 
|---|
|  | 4470 | +	spin_unlock(&ci->i_ceph_lock); | 
|---|
| 4089 | 4471 | } | 
|---|
| 4090 | 4472 |  | 
|---|
| 4091 | 4473 | /* | 
|---|
| 4092 |  | - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it | 
|---|
|  | 4474 | + * For a soon-to-be unlinked file, drop the LINK caps. If it | 
|---|
| 4093 | 4475 | * looks like the link count will hit 0, drop any other caps (other | 
|---|
| 4094 | 4476 | * than PIN) we don't specifically want (due to the file still being | 
|---|
| 4095 | 4477 | * open). | 
|---|
| .. | .. | 
|---|
| 4103 | 4485 | if (inode->i_nlink == 1) { | 
|---|
| 4104 | 4486 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); | 
|---|
| 4105 | 4487 |  | 
|---|
| 4106 |  | -		ci->i_ceph_flags |= CEPH_I_NODELAY; | 
|---|
| 4107 | 4488 | if (__ceph_caps_dirty(ci)) { | 
|---|
| 4108 | 4489 | struct ceph_mds_client *mdsc = | 
|---|
| 4109 | 4490 | ceph_inode_to_client(inode)->mdsc; | 
|---|
| .. | .. | 
|---|
| 4159 | 4540 | if (force || (cap->issued & drop)) { | 
|---|
| 4160 | 4541 | if (cap->issued & drop) { | 
|---|
| 4161 | 4542 | int wanted = __ceph_caps_wanted(ci); | 
|---|
| 4162 |  | -				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) | 
|---|
| 4163 |  | -					wanted |= cap->mds_wanted; | 
|---|
| 4164 | 4543 | dout("encode_inode_release %p cap %p " | 
|---|
| 4165 | 4544 | "%s -> %s, wanted %s -> %s\n", inode, cap, | 
|---|
| 4166 | 4545 | ceph_cap_string(cap->issued), | 
|---|
| .. | .. | 
|---|
| 4171 | 4550 | cap->issued &= ~drop; | 
|---|
| 4172 | 4551 | cap->implemented &= ~drop; | 
|---|
| 4173 | 4552 | cap->mds_wanted = wanted; | 
|---|
|  | 4553 | +				if (cap == ci->i_auth_cap && | 
|---|
|  | 4554 | +				    !(wanted & CEPH_CAP_ANY_FILE_WR)) | 
|---|
|  | 4555 | +					ci->i_requested_max_size = 0; | 
|---|
| 4174 | 4556 | } else { | 
|---|
| 4175 | 4557 | dout("encode_inode_release %p cap %p %s" | 
|---|
| 4176 | 4558 | " (force)\n", inode, cap, | 
|---|