~hc/RK356X_SDK_RELEASE.git

..	..	@@ -13,6 +13,7 @@
13	13	#include <linux/posix_acl.h>
14	14	#include <linux/random.h>
15	15	#include <linux/sort.h>
	16	+#include <linux/iversion.h>
16	17
17	18	#include "super.h"
18	19	#include "mds_client.h"
..	..	@@ -33,36 +34,38 @@
33	34
34	35	static const struct inode_operations ceph_symlink_iops;
35	36
36		-static void ceph_invalidate_work(struct work_struct *work);
37		-static void ceph_writeback_work(struct work_struct *work);
38		-static void ceph_vmtruncate_work(struct work_struct *work);
	37	+static void ceph_inode_work(struct work_struct *work);
39	38
40	39	/*
41	40	* find or create an inode, given the ceph ino number
42	41	*/
43	42	static int ceph_set_ino_cb(struct inode *inode, void *data)
44	43	{
45		- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
46		- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
	44	+ struct ceph_inode_info *ci = ceph_inode(inode);
	45	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	46	+
	47	+ ci->i_vino = *(struct ceph_vino *)data;
	48	+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
	49	+ inode_set_iversion_raw(inode, 0);
	50	+ percpu_counter_inc(&mdsc->metric.total_inodes);
	51	+
47	52	return 0;
48	53	}
49	54
50	55	struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
51	56	{
52	57	struct inode *inode;
53		- ino_t t = ceph_vino_to_ino(vino);
54	58
55		- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
	59	+ if (ceph_vino_is_reserved(vino))
	60	+ return ERR_PTR(-EREMOTEIO);
	61	+
	62	+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
	63	+ ceph_set_ino_cb, &vino);
56	64	if (!inode)
57	65	return ERR_PTR(-ENOMEM);
58		- if (inode->i_state & I_NEW) {
59		- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
60		- inode, ceph_vinop(inode), (u64)inode->i_ino);
61		- unlock_new_inode(inode);
62		- }
63	66
64		- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
65		- vino.snap, inode);
	67	+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
	68	+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
66	69	return inode;
67	70	}
68	71
..	..	@@ -84,10 +87,19 @@
84	87	inode->i_mode = parent->i_mode;
85	88	inode->i_uid = parent->i_uid;
86	89	inode->i_gid = parent->i_gid;
87		- inode->i_op = &ceph_snapdir_iops;
88		- inode->i_fop = &ceph_snapdir_fops;
89		- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
	90	+ inode->i_mtime = parent->i_mtime;
	91	+ inode->i_ctime = parent->i_ctime;
	92	+ inode->i_atime = parent->i_atime;
90	93	ci->i_rbytes = 0;
	94	+ ci->i_btime = ceph_inode(parent)->i_btime;
	95	+
	96	+ if (inode->i_state & I_NEW) {
	97	+ inode->i_op = &ceph_snapdir_iops;
	98	+ inode->i_fop = &ceph_snapdir_fops;
	99	+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
	100	+ unlock_new_inode(inode);
	101	+ }
	102	+
91	103	return inode;
92	104	}
93	105
..	..	@@ -445,6 +457,7 @@
445	457	ci->i_max_files = 0;
446	458
447	459	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
	460	+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
448	461	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
449	462
450	463	ci->i_fragtree = RB_ROOT;
..	..	@@ -469,13 +482,13 @@
469	482	ci->i_prealloc_cap_flush = NULL;
470	483	INIT_LIST_HEAD(&ci->i_cap_flush_list);
471	484	init_waitqueue_head(&ci->i_cap_wq);
472		- ci->i_hold_caps_min = 0;
473	485	ci->i_hold_caps_max = 0;
474	486	INIT_LIST_HEAD(&ci->i_cap_delay_list);
475	487	INIT_LIST_HEAD(&ci->i_cap_snaps);
476	488	ci->i_head_snapc = NULL;
477	489	ci->i_snap_caps = 0;
478	490
	491	+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
479	492	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
480	493	ci->i_nr_by_mode[i] = 0;
481	494
..	..	@@ -494,10 +507,11 @@
494	507	ci->i_rdcache_ref = 0;
495	508	ci->i_wr_ref = 0;
496	509	ci->i_wb_ref = 0;
	510	+ ci->i_fx_ref = 0;
497	511	ci->i_wrbuffer_ref = 0;
498	512	ci->i_wrbuffer_ref_head = 0;
499	513	atomic_set(&ci->i_filelock_ref, 0);
500		- atomic_set(&ci->i_shared_gen, 0);
	514	+ atomic_set(&ci->i_shared_gen, 1);
501	515	ci->i_rdcache_gen = 0;
502	516	ci->i_rdcache_revoking = 0;
503	517
..	..	@@ -509,19 +523,17 @@
509	523	INIT_LIST_HEAD(&ci->i_snap_realm_item);
510	524	INIT_LIST_HEAD(&ci->i_snap_flush_item);
511	525
512		- INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
513		- INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
514		-
515		- INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
	526	+ INIT_WORK(&ci->i_work, ceph_inode_work);
	527	+ ci->i_work_mask = 0;
	528	+ memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
516	529
517	530	ceph_fscache_inode_init(ci);
518	531
519	532	return &ci->vfs_inode;
520	533	}
521	534
522		-static void ceph_i_callback(struct rcu_head *head)
	535	+void ceph_free_inode(struct inode *inode)
523	536	{
524		- struct inode *inode = container_of(head, struct inode, i_rcu);
525	537	struct ceph_inode_info *ci = ceph_inode(inode);
526	538
527	539	kfree(ci->i_symlink);
..	..	@@ -531,17 +543,20 @@
531	543	void ceph_evict_inode(struct inode *inode)
532	544	{
533	545	struct ceph_inode_info *ci = ceph_inode(inode);
	546	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
534	547	struct ceph_inode_frag *frag;
535	548	struct rb_node *n;
536	549
537	550	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
	551	+
	552	+ percpu_counter_dec(&mdsc->metric.total_inodes);
538	553
539	554	truncate_inode_pages_final(&inode->i_data);
540	555	clear_inode(inode);
541	556
542	557	ceph_fscache_unregister_inode_cookie(ci);
543	558
544		- ceph_queue_caps_release(inode);
	559	+ __ceph_remove_caps(ci);
545	560
546	561	if (__ceph_has_any_quota(ci))
547	562	ceph_adjust_quota_realms_count(inode, false);
..	..	@@ -551,18 +566,21 @@
551	566	* caps in i_snap_caps.
552	567	*/
553	568	if (ci->i_snap_realm) {
554		- struct ceph_mds_client *mdsc =
555		- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
556		- struct ceph_snap_realm *realm = ci->i_snap_realm;
557		-
558		- dout(" dropping residual ref to snap realm %p\n", realm);
559		- spin_lock(&realm->inodes_with_caps_lock);
560		- list_del_init(&ci->i_snap_realm_item);
561		- ci->i_snap_realm = NULL;
562		- if (realm->ino == ci->i_vino.ino)
563		- realm->inode = NULL;
564		- spin_unlock(&realm->inodes_with_caps_lock);
565		- ceph_put_snap_realm(mdsc, realm);
	569	+ if (ceph_snap(inode) == CEPH_NOSNAP) {
	570	+ struct ceph_snap_realm *realm = ci->i_snap_realm;
	571	+ dout(" dropping residual ref to snap realm %p\n",
	572	+ realm);
	573	+ spin_lock(&realm->inodes_with_caps_lock);
	574	+ list_del_init(&ci->i_snap_realm_item);
	575	+ ci->i_snap_realm = NULL;
	576	+ if (realm->ino == ci->i_vino.ino)
	577	+ realm->inode = NULL;
	578	+ spin_unlock(&realm->inodes_with_caps_lock);
	579	+ ceph_put_snap_realm(mdsc, realm);
	580	+ } else {
	581	+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
	582	+ ci->i_snap_realm = NULL;
	583	+ }
566	584	}
567	585
568	586	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
..	..	@@ -579,21 +597,7 @@
579	597	ceph_buffer_put(ci->i_xattrs.prealloc_blob);
580	598
581	599	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
582		-}
583		-
584		-void ceph_destroy_inode(struct inode *inode)
585		-{
586		- call_rcu(&inode->i_rcu, ceph_i_callback);
587		-}
588		-
589		-int ceph_drop_inode(struct inode *inode)
590		-{
591		- /*
592		- * Positve dentry and corresponding inode are always accompanied
593		- * in MDS reply. So no need to keep inode in the cache after
594		- * dropping all its aliases.
595		- */
596		- return 1;
	600	+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
597	601	}
598	602
599	603	static inline blkcnt_t calc_inode_blocks(u64 size)
..	..	@@ -644,7 +648,7 @@
644	648	if ((issued & (CEPH_CAP_FILE_CACHE|
645	649	CEPH_CAP_FILE_BUFFER)) ||
646	650	mapping_mapped(inode->i_mapping) ||
647		- __ceph_caps_file_wanted(ci)) {
	651	+ __ceph_is_file_opened(ci)) {
648	652	ci->i_truncate_pending++;
649	653	queue_trunc = 1;
650	654	}
..	..	@@ -735,14 +739,13 @@
735	739	* Populate an inode based on info from mds. May be called on new or
736	740	* existing inodes.
737	741	*/
738		-static int fill_inode(struct inode *inode, struct page *locked_page,
739		- struct ceph_mds_reply_info_in *iinfo,
740		- struct ceph_mds_reply_dirfrag *dirinfo,
741		- struct ceph_mds_session *session,
742		- unsigned long ttl_from, int cap_fmode,
743		- struct ceph_cap_reservation *caps_reservation)
	742	+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
	743	+ struct ceph_mds_reply_info_in *iinfo,
	744	+ struct ceph_mds_reply_dirfrag *dirinfo,
	745	+ struct ceph_mds_session *session, int cap_fmode,
	746	+ struct ceph_cap_reservation *caps_reservation)
744	747	{
745		- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	748	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
746	749	struct ceph_mds_reply_inode *info = iinfo->in;
747	750	struct ceph_inode_info *ci = ceph_inode(inode);
748	751	int issued, new_issued, info_caps;
..	..	@@ -757,7 +760,9 @@
757	760	bool new_version = false;
758	761	bool fill_inline = false;
759	762
760		- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
	763	+ lockdep_assert_held(&mdsc->snap_rwsem);
	764	+
	765	+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
761	766	inode, ceph_vinop(inode), le64_to_cpu(info->version),
762	767	ci->i_version);
763	768
..	..	@@ -778,13 +783,16 @@
778	783	if (iinfo->xattr_len > 4) {
779	784	xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
780	785	if (!xattr_blob)
781		- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
	786	+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
782	787	iinfo->xattr_len);
783	788	}
784	789
785	790	if (iinfo->pool_ns_len > 0)
786	791	pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
787	792	iinfo->pool_ns_len);
	793	+
	794	+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
	795	+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
788	796
789	797	spin_lock(&ci->i_ceph_lock);
790	798
..	..	@@ -803,6 +811,9 @@
803	811	((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
804	812	le64_to_cpu(info->version) > (ci->i_version & ~1)))
805	813	new_version = true;
	814	+
	815	+ /* Update change_attribute */
	816	+ inode_set_max_iversion_raw(inode, iinfo->change_attr);
806	817
807	818	__ceph_caps_issued(ci, &issued);
808	819	issued |= __ceph_caps_dirty(ci);
..	..	@@ -827,6 +838,8 @@
827	838	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
828	839	from_kuid(&init_user_ns, inode->i_uid),
829	840	from_kgid(&init_user_ns, inode->i_gid));
	841	+ ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
	842	+ ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
830	843	}
831	844
832	845	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
..	..	@@ -884,6 +897,7 @@
884	897	ci->i_rbytes = le64_to_cpu(info->rbytes);
885	898	ci->i_rfiles = le64_to_cpu(info->rfiles);
886	899	ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
	900	+ ci->i_dir_pin = iinfo->dir_pin;
887	901	ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
888	902	}
889	903	}
..	..	@@ -900,6 +914,7 @@
900	914	iinfo->xattr_data, iinfo->xattr_len);
901	915	ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
902	916	ceph_forget_all_cached_acls(inode);
	917	+ ceph_security_invalidate_secctx(inode);
903	918	xattr_blob = NULL;
904	919	}
905	920
..	..	@@ -914,6 +929,7 @@
914	929	case S_IFBLK:
915	930	case S_IFCHR:
916	931	case S_IFSOCK:
	932	+ inode->i_blkbits = PAGE_SHIFT;
917	933	init_special_inode(inode, inode->i_mode, inode->i_rdev);
918	934	inode->i_op = &ceph_file_iops;
919	935	break;
..	..	@@ -930,8 +946,9 @@
930	946	spin_unlock(&ci->i_ceph_lock);
931	947
932	948	if (symlen != i_size_read(inode)) {
933		- pr_err("fill_inode %llx.%llx BAD symlink "
934		- "size %lld\n", ceph_vinop(inode),
	949	+ pr_err("%s %llx.%llx BAD symlink "
	950	+ "size %lld\n", __func__,
	951	+ ceph_vinop(inode),
935	952	i_size_read(inode));
936	953	i_size_write(inode, symlen);
937	954	inode->i_blocks = calc_inode_blocks(symlen);
..	..	@@ -955,7 +972,7 @@
955	972	inode->i_fop = &ceph_dir_fops;
956	973	break;
957	974	default:
958		- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
	975	+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
959	976	ceph_vinop(inode), inode->i_mode);
960	977	}
961	978
..	..	@@ -964,7 +981,7 @@
964	981	if (ceph_snap(inode) == CEPH_NOSNAP) {
965	982	ceph_add_cap(inode, session,
966	983	le64_to_cpu(info->cap.cap_id),
967		- cap_fmode, info_caps,
	984	+ info_caps,
968	985	le32_to_cpu(info->cap.wanted),
969	986	le32_to_cpu(info->cap.seq),
970	987	le32_to_cpu(info->cap.mseq),
..	..	@@ -989,13 +1006,7 @@
989	1006	dout(" %p got snap_caps %s\n", inode,
990	1007	ceph_cap_string(info_caps));
991	1008	ci->i_snap_caps |= info_caps;
992		- if (cap_fmode >= 0)
993		- __ceph_get_fmode(ci, cap_fmode);
994	1009	}
995		- } else if (cap_fmode >= 0) {
996		- pr_warn("mds issued no caps on %llx.%llx\n",
997		- ceph_vinop(inode));
998		- __ceph_get_fmode(ci, cap_fmode);
999	1010	}
1000	1011
1001	1012	if (iinfo->inline_version > 0 &&
..	..	@@ -1005,6 +1016,13 @@
1005	1016	if (ci->i_inline_version != CEPH_INLINE_NONE &&
1006	1017	(locked_page || (info_caps & cache_caps)))
1007	1018	fill_inline = true;
	1019	+ }
	1020	+
	1021	+ if (cap_fmode >= 0) {
	1022	+ if (!info_caps)
	1023	+ pr_warn("mds issued no caps on %llx.%llx\n",
	1024	+ ceph_vinop(inode));
	1025	+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
1008	1026	}
1009	1027
1010	1028	spin_unlock(&ci->i_ceph_lock);
..	..	@@ -1039,62 +1057,46 @@
1039	1057	}
1040	1058
1041	1059	/*
1042		- * caller should hold session s_mutex.
	1060	+ * caller should hold session s_mutex and dentry->d_lock.
1043	1061	*/
1044		-static void update_dentry_lease(struct dentry *dentry,
1045		- struct ceph_mds_reply_lease *lease,
1046		- struct ceph_mds_session *session,
1047		- unsigned long from_time,
1048		- struct ceph_vino *tgt_vino,
1049		- struct ceph_vino *dir_vino)
	1062	+static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
	1063	+ struct ceph_mds_reply_lease *lease,
	1064	+ struct ceph_mds_session *session,
	1065	+ unsigned long from_time,
	1066	+ struct ceph_mds_session **old_lease_session)
1050	1067	{
1051	1068	struct ceph_dentry_info *di = ceph_dentry(dentry);
	1069	+ unsigned mask = le16_to_cpu(lease->mask);
1052	1070	long unsigned duration = le32_to_cpu(lease->duration_ms);
1053	1071	long unsigned ttl = from_time + (duration * HZ) / 1000;
1054	1072	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
1055		- struct inode *dir;
1056		- struct ceph_mds_session *old_lease_session = NULL;
1057	1073
1058		- /*
1059		- * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
1060		- * we expect a negative dentry.
1061		- */
1062		- if (!tgt_vino && d_really_is_positive(dentry))
1063		- return;
1064		-
1065		- if (tgt_vino && (d_really_is_negative(dentry) ||
1066		- !ceph_ino_compare(d_inode(dentry), tgt_vino)))
1067		- return;
1068		-
1069		- spin_lock(&dentry->d_lock);
1070	1074	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
1071	1075	dentry, duration, ttl);
1072	1076
1073		- dir = d_inode(dentry->d_parent);
1074		-
1075		- /* make sure parent matches dir_vino */
1076		- if (!ceph_ino_compare(dir, dir_vino))
1077		- goto out_unlock;
1078		-
1079	1077	/* only track leases on regular dentries */
1080	1078	if (ceph_snap(dir) != CEPH_NOSNAP)
1081		- goto out_unlock;
	1079	+ return;
	1080	+
	1081	+ if (mask & CEPH_LEASE_PRIMARY_LINK)
	1082	+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
	1083	+ else
	1084	+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1082	1085
1083	1086	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
1084		-
1085		- if (duration == 0)
1086		- goto out_unlock;
	1087	+ if (!(mask & CEPH_LEASE_VALID)) {
	1088	+ __ceph_dentry_dir_lease_touch(di);
	1089	+ return;
	1090	+ }
1087	1091
1088	1092	if (di->lease_gen == session->s_cap_gen &&
1089	1093	time_before(ttl, di->time))
1090		- goto out_unlock; /* we already have a newer lease. */
	1094	+ return; /* we already have a newer lease. */
1091	1095
1092	1096	if (di->lease_session && di->lease_session != session) {
1093		- old_lease_session = di->lease_session;
	1097	+ *old_lease_session = di->lease_session;
1094	1098	di->lease_session = NULL;
1095	1099	}
1096		-
1097		- ceph_dentry_lru_touch(dentry);
1098	1100
1099	1101	if (!di->lease_session)
1100	1102	di->lease_session = ceph_get_mds_session(session);
..	..	@@ -1103,18 +1105,75 @@
1103	1105	di->lease_renew_after = half_ttl;
1104	1106	di->lease_renew_from = 0;
1105	1107	di->time = ttl;
	1108	+
	1109	+ __ceph_dentry_lease_touch(di);
	1110	+}
	1111	+
	1112	+static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
	1113	+ struct ceph_mds_reply_lease *lease,
	1114	+ struct ceph_mds_session *session,
	1115	+ unsigned long from_time)
	1116	+{
	1117	+ struct ceph_mds_session *old_lease_session = NULL;
	1118	+ spin_lock(&dentry->d_lock);
	1119	+ __update_dentry_lease(dir, dentry, lease, session, from_time,
	1120	+ &old_lease_session);
	1121	+ spin_unlock(&dentry->d_lock);
	1122	+ ceph_put_mds_session(old_lease_session);
	1123	+}
	1124	+
	1125	+/*
	1126	+ * update dentry lease without having parent inode locked
	1127	+ */
	1128	+static void update_dentry_lease_careful(struct dentry *dentry,
	1129	+ struct ceph_mds_reply_lease *lease,
	1130	+ struct ceph_mds_session *session,
	1131	+ unsigned long from_time,
	1132	+ char *dname, u32 dname_len,
	1133	+ struct ceph_vino *pdvino,
	1134	+ struct ceph_vino *ptvino)
	1135	+
	1136	+{
	1137	+ struct inode *dir;
	1138	+ struct ceph_mds_session *old_lease_session = NULL;
	1139	+
	1140	+ spin_lock(&dentry->d_lock);
	1141	+ /* make sure dentry's name matches target */
	1142	+ if (dentry->d_name.len != dname_len ||
	1143	+ memcmp(dentry->d_name.name, dname, dname_len))
	1144	+ goto out_unlock;
	1145	+
	1146	+ dir = d_inode(dentry->d_parent);
	1147	+ /* make sure parent matches dvino */
	1148	+ if (!ceph_ino_compare(dir, pdvino))
	1149	+ goto out_unlock;
	1150	+
	1151	+ /* make sure dentry's inode matches target. NULL ptvino means that
	1152	+ * we expect a negative dentry */
	1153	+ if (ptvino) {
	1154	+ if (d_really_is_negative(dentry))
	1155	+ goto out_unlock;
	1156	+ if (!ceph_ino_compare(d_inode(dentry), ptvino))
	1157	+ goto out_unlock;
	1158	+ } else {
	1159	+ if (d_really_is_positive(dentry))
	1160	+ goto out_unlock;
	1161	+ }
	1162	+
	1163	+ __update_dentry_lease(dir, dentry, lease, session,
	1164	+ from_time, &old_lease_session);
1106	1165	out_unlock:
1107	1166	spin_unlock(&dentry->d_lock);
1108		- if (old_lease_session)
1109		- ceph_put_mds_session(old_lease_session);
	1167	+ ceph_put_mds_session(old_lease_session);
1110	1168	}
1111	1169
1112	1170	/*
1113	1171	* splice a dentry to an inode.
1114	1172	* caller must hold directory i_mutex for this to be safe.
1115	1173	*/
1116		-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
	1174	+static int splice_dentry(struct dentry **pdn, struct inode *in)
1117	1175	{
	1176	+ struct dentry *dn = *pdn;
1118	1177	struct dentry *realdn;
1119	1178
1120	1179	BUG_ON(d_inode(dn));
..	..	@@ -1147,28 +1206,23 @@
1147	1206	if (IS_ERR(realdn)) {
1148	1207	pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
1149	1208	PTR_ERR(realdn), dn, in, ceph_vinop(in));
1150		- dn = realdn;
1151		- /*
1152		- * Caller should release 'dn' in the case of error.
1153		- * If 'req->r_dentry' is passed to this function,
1154		- * caller should leave 'req->r_dentry' untouched.
1155		- */
1156		- goto out;
1157		- } else if (realdn) {
	1209	+ return PTR_ERR(realdn);
	1210	+ }
	1211	+
	1212	+ if (realdn) {
1158	1213	dout("dn %p (%d) spliced with %p (%d) "
1159	1214	"inode %p ino %llx.%llx\n",
1160	1215	dn, d_count(dn),
1161	1216	realdn, d_count(realdn),
1162	1217	d_inode(realdn), ceph_vinop(d_inode(realdn)));
1163	1218	dput(dn);
1164		- dn = realdn;
	1219	+ *pdn = realdn;
1165	1220	} else {
1166	1221	BUG_ON(!ceph_dentry(dn));
1167	1222	dout("dn %p attached to %p ino %llx.%llx\n",
1168	1223	dn, d_inode(dn), ceph_vinop(d_inode(dn)));
1169	1224	}
1170		-out:
1171		- return dn;
	1225	+ return 0;
1172	1226	}
1173	1227
1174	1228	/*
..	..	@@ -1205,17 +1259,18 @@
1205	1259	struct inode *dir = req->r_parent;
1206	1260
1207	1261	if (dir) {
1208		- err = fill_inode(dir, NULL,
1209		- &rinfo->diri, rinfo->dirfrag,
1210		- session, req->r_request_started, -1,
1211		- &req->r_caps_reservation);
	1262	+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
	1263	+ rinfo->dirfrag, session, -1,
	1264	+ &req->r_caps_reservation);
1212	1265	if (err < 0)
1213	1266	goto done;
1214	1267	} else {
1215	1268	WARN_ON_ONCE(1);
1216	1269	}
1217	1270
1218		- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
	1271	+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
	1272	+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
	1273	+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1219	1274	struct qstr dname;
1220	1275	struct dentry *dn, *parent;
1221	1276
..	..	@@ -1270,18 +1325,25 @@
1270	1325	err = PTR_ERR(in);
1271	1326	goto done;
1272	1327	}
1273		- req->r_target_inode = in;
1274	1328
1275		- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
1276		- session, req->r_request_started,
	1329	+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
	1330	+ NULL, session,
1277	1331	(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1278		- rinfo->head->result == 0) ? req->r_fmode : -1,
	1332	+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
	1333	+ rinfo->head->result == 0) ? req->r_fmode : -1,
1279	1334	&req->r_caps_reservation);
1280	1335	if (err < 0) {
1281		- pr_err("fill_inode badness %p %llx.%llx\n",
	1336	+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
1282	1337	in, ceph_vinop(in));
	1338	+ if (in->i_state & I_NEW)
	1339	+ discard_new_inode(in);
	1340	+ else
	1341	+ iput(in);
1283	1342	goto done;
1284	1343	}
	1344	+ req->r_target_inode = in;
	1345	+ if (in->i_state & I_NEW)
	1346	+ unlock_new_inode(in);
1285	1347	}
1286	1348
1287	1349	/*
..	..	@@ -1353,7 +1415,12 @@
1353	1415	dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1354	1416	ceph_dentry(req->r_old_dentry)->offset);
1355	1417
1356		- dn = req->r_old_dentry; /* use old_dentry */
	1418	+ /* swap r_dentry and r_old_dentry in case that
	1419	+ * splice_dentry() gets called later. This is safe
	1420	+ * because no other place will use them */
	1421	+ req->r_dentry = req->r_old_dentry;
	1422	+ req->r_old_dentry = dn;
	1423	+ dn = req->r_dentry;
1357	1424	}
1358	1425
1359	1426	/* null dentry? */
..	..	@@ -1366,10 +1433,9 @@
1366	1433	} else if (have_lease) {
1367	1434	if (d_unhashed(dn))
1368	1435	d_add(dn, NULL);
1369		- update_dentry_lease(dn, rinfo->dlease,
1370		- session,
1371		- req->r_request_started,
1372		- NULL, &dvino);
	1436	+ update_dentry_lease(dir, dn,
	1437	+ rinfo->dlease, session,
	1438	+ req->r_request_started);
1373	1439	}
1374	1440	goto done;
1375	1441	}
..	..	@@ -1378,12 +1444,10 @@
1378	1444	if (d_really_is_negative(dn)) {
1379	1445	ceph_dir_clear_ordered(dir);
1380	1446	ihold(in);
1381		- dn = splice_dentry(dn, in);
1382		- if (IS_ERR(dn)) {
1383		- err = PTR_ERR(dn);
	1447	+ err = splice_dentry(&req->r_dentry, in);
	1448	+ if (err < 0)
1384	1449	goto done;
1385		- }
1386		- req->r_dentry = dn; /* may have spliced */
	1450	+ dn = req->r_dentry; /* may have spliced */
1387	1451	} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
1388	1452	dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1389	1453	dn, d_inode(dn), ceph_vinop(d_inode(dn)),
..	..	@@ -1393,53 +1457,41 @@
1393	1457	}
1394	1458
1395	1459	if (have_lease) {
1396		- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1397		- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1398		- update_dentry_lease(dn, rinfo->dlease, session,
1399		- req->r_request_started,
1400		- &tvino, &dvino);
	1460	+ update_dentry_lease(dir, dn,
	1461	+ rinfo->dlease, session,
	1462	+ req->r_request_started);
1401	1463	}
1402	1464	dout(" final dn %p\n", dn);
1403	1465	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1404	1466	req->r_op == CEPH_MDS_OP_MKSNAP) &&
1405	1467	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1406	1468	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1407		- struct dentry *dn = req->r_dentry;
1408	1469	struct inode *dir = req->r_parent;
1409	1470
1410	1471	/* fill out a snapdir LOOKUPSNAP dentry */
1411		- BUG_ON(!dn);
1412	1472	BUG_ON(!dir);
1413	1473	BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1414		- dout(" linking snapped dir %p to dn %p\n", in, dn);
	1474	+ BUG_ON(!req->r_dentry);
	1475	+ dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
1415	1476	ceph_dir_clear_ordered(dir);
1416	1477	ihold(in);
1417		- dn = splice_dentry(dn, in);
1418		- if (IS_ERR(dn)) {
1419		- err = PTR_ERR(dn);
	1478	+ err = splice_dentry(&req->r_dentry, in);
	1479	+ if (err < 0)
1420	1480	goto done;
1421		- }
1422		- req->r_dentry = dn; /* may have spliced */
1423		- } else if (rinfo->head->is_dentry) {
	1481	+ } else if (rinfo->head->is_dentry && req->r_dentry) {
	1482	+ /* parent inode is not locked, be careful */
1424	1483	struct ceph_vino *ptvino = NULL;
1425		-
1426		- if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
1427		- le32_to_cpu(rinfo->dlease->duration_ms)) {
1428		- dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1429		- dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1430		-
1431		- if (rinfo->head->is_target) {
1432		- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1433		- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1434		- ptvino = &tvino;
1435		- }
1436		-
1437		- update_dentry_lease(req->r_dentry, rinfo->dlease,
1438		- session, req->r_request_started, ptvino,
1439		- &dvino);
1440		- } else {
1441		- dout("%s: no dentry lease or dir cap\n", __func__);
	1484	+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
	1485	+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
	1486	+ if (rinfo->head->is_target) {
	1487	+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
	1488	+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
	1489	+ ptvino = &tvino;
1442	1490	}
	1491	+ update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
	1492	+ session, req->r_request_started,
	1493	+ rinfo->dname, rinfo->dname_len,
	1494	+ &dvino, ptvino);
1443	1495	}
1444	1496	done:
1445	1497	dout("fill_trace done err=%d\n", err);
..	..	@@ -1470,14 +1522,22 @@
1470	1522	dout("new_inode badness got %d\n", err);
1471	1523	continue;
1472	1524	}
1473		- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1474		- req->r_request_started, -1,
1475		- &req->r_caps_reservation);
	1525	+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
	1526	+ -1, &req->r_caps_reservation);
1476	1527	if (rc < 0) {
1477		- pr_err("fill_inode badness on %p got %d\n", in, rc);
	1528	+ pr_err("ceph_fill_inode badness on %p got %d\n",
	1529	+ in, rc);
1478	1530	err = rc;
	1531	+ if (in->i_state & I_NEW) {
	1532	+ ihold(in);
	1533	+ discard_new_inode(in);
	1534	+ }
	1535	+ } else if (in->i_state & I_NEW) {
	1536	+ unlock_new_inode(in);
1479	1537	}
1480		- iput(in);
	1538	+
	1539	+ /* avoid calling iput_final() in mds dispatch threads */
	1540	+ ceph_async_iput(in);
1481	1541	}
1482	1542
1483	1543	return err;
..	..	@@ -1600,7 +1660,7 @@
1600	1660	/* FIXME: release caps/leases if error occurs */
1601	1661	for (i = 0; i < rinfo->dir_nr; i++) {
1602	1662	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1603		- struct ceph_vino tvino, dvino;
	1663	+ struct ceph_vino tvino;
1604	1664
1605	1665	dname.name = rde->name;
1606	1666	dname.len = rde->name_len;
..	..	@@ -1670,43 +1730,45 @@
1670	1730	}
1671	1731	}
1672	1732
1673		- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1674		- req->r_request_started, -1,
1675		- &req->r_caps_reservation);
	1733	+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
	1734	+ -1, &req->r_caps_reservation);
1676	1735	if (ret < 0) {
1677		- pr_err("fill_inode badness on %p\n", in);
1678		- if (d_really_is_negative(dn))
1679		- iput(in);
	1736	+ pr_err("ceph_fill_inode badness on %p\n", in);
	1737	+ if (d_really_is_negative(dn)) {
	1738	+ /* avoid calling iput_final() in mds
	1739	+ * dispatch threads */
	1740	+ if (in->i_state & I_NEW) {
	1741	+ ihold(in);
	1742	+ discard_new_inode(in);
	1743	+ }
	1744	+ ceph_async_iput(in);
	1745	+ }
1680	1746	d_drop(dn);
1681	1747	err = ret;
1682	1748	goto next_item;
1683	1749	}
	1750	+ if (in->i_state & I_NEW)
	1751	+ unlock_new_inode(in);
1684	1752
1685	1753	if (d_really_is_negative(dn)) {
1686		- struct dentry *realdn;
1687		-
1688	1754	if (ceph_security_xattr_deadlock(in)) {
1689	1755	dout(" skip splicing dn %p to inode %p"
1690	1756	" (security xattr deadlock)\n", dn, in);
1691		- iput(in);
	1757	+ ceph_async_iput(in);
1692	1758	skipped++;
1693	1759	goto next_item;
1694	1760	}
1695	1761
1696		- realdn = splice_dentry(dn, in);
1697		- if (IS_ERR(realdn)) {
1698		- err = PTR_ERR(realdn);
1699		- d_drop(dn);
	1762	+ err = splice_dentry(&dn, in);
	1763	+ if (err < 0)
1700	1764	goto next_item;
1701		- }
1702		- dn = realdn;
1703	1765	}
1704	1766
1705	1767	ceph_dentry(dn)->offset = rde->offset;
1706	1768
1707		- dvino = ceph_vino(d_inode(parent));
1708		- update_dentry_lease(dn, rde->lease, req->r_session,
1709		- req->r_request_started, &tvino, &dvino);
	1769	+ update_dentry_lease(d_inode(parent), dn,
	1770	+ rde->lease, req->r_session,
	1771	+ req->r_request_started);
1710	1772
1711	1773	if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
1712	1774	ret = fill_readdir_cache(d_inode(parent), dn,
..	..	@@ -1715,8 +1777,7 @@
1715	1777	err = ret;
1716	1778	}
1717	1779	next_item:
1718		- if (dn)
1719		- dput(dn);
	1780	+ dput(dn);
1720	1781	}
1721	1782	out:
1722	1783	if (err == 0 && skipped == 0) {
..	..	@@ -1745,30 +1806,42 @@
1745	1806	}
1746	1807
1747	1808	/*
	1809	+ * Put reference to inode, but avoid calling iput_final() in current thread.
	1810	+ * iput_final() may wait for readahead pages. The wait can cause deadlock in
	1811	+ * some contexts.
	1812	+ */
	1813	+void ceph_async_iput(struct inode *inode)
	1814	+{
	1815	+ if (!inode)
	1816	+ return;
	1817	+ for (;;) {
	1818	+ if (atomic_add_unless(&inode->i_count, -1, 1))
	1819	+ break;
	1820	+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
	1821	+ &ceph_inode(inode)->i_work))
	1822	+ break;
	1823	+ /* queue work failed, i_count must be at least 2 */
	1824	+ }
	1825	+}
	1826	+
	1827	+/*
1748	1828	* Write back inode data in a worker thread. (This can't be done
1749	1829	* in the message handler context.)
1750	1830	*/
1751	1831	void ceph_queue_writeback(struct inode *inode)
1752	1832	{
	1833	+ struct ceph_inode_info *ci = ceph_inode(inode);
	1834	+ set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
	1835	+
1753	1836	ihold(inode);
1754		- if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1755		- &ceph_inode(inode)->i_wb_work)) {
	1837	+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
	1838	+ &ci->i_work)) {
1756	1839	dout("ceph_queue_writeback %p\n", inode);
1757	1840	} else {
1758		- dout("ceph_queue_writeback %p failed\n", inode);
	1841	+ dout("ceph_queue_writeback %p already queued, mask=%lx\n",
	1842	+ inode, ci->i_work_mask);
1759	1843	iput(inode);
1760	1844	}
1761		-}
1762		-
1763		-static void ceph_writeback_work(struct work_struct *work)
1764		-{
1765		- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1766		- i_wb_work);
1767		- struct inode *inode = &ci->vfs_inode;
1768		-
1769		- dout("writeback %p\n", inode);
1770		- filemap_fdatawrite(&inode->i_data);
1771		- iput(inode);
1772	1845	}
1773	1846
1774	1847	/*
..	..	@@ -1776,25 +1849,43 @@
1776	1849	*/
1777	1850	void ceph_queue_invalidate(struct inode *inode)
1778	1851	{
	1852	+ struct ceph_inode_info *ci = ceph_inode(inode);
	1853	+ set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
	1854	+
1779	1855	ihold(inode);
1780		- if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1781		- &ceph_inode(inode)->i_pg_inv_work)) {
	1856	+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
	1857	+ &ceph_inode(inode)->i_work)) {
1782	1858	dout("ceph_queue_invalidate %p\n", inode);
1783	1859	} else {
1784		- dout("ceph_queue_invalidate %p failed\n", inode);
	1860	+ dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
	1861	+ inode, ci->i_work_mask);
1785	1862	iput(inode);
1786	1863	}
1787	1864	}
1788	1865
1789	1866	/*
1790		- * Invalidate inode pages in a worker thread. (This can't be done
1791		- * in the message handler context.)
	1867	+ * Queue an async vmtruncate. If we fail to queue work, we will handle
	1868	+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
1792	1869	*/
1793		-static void ceph_invalidate_work(struct work_struct *work)
	1870	+void ceph_queue_vmtruncate(struct inode *inode)
1794	1871	{
1795		- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1796		- i_pg_inv_work);
1797		- struct inode *inode = &ci->vfs_inode;
	1872	+ struct ceph_inode_info *ci = ceph_inode(inode);
	1873	+ set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
	1874	+
	1875	+ ihold(inode);
	1876	+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
	1877	+ &ci->i_work)) {
	1878	+ dout("ceph_queue_vmtruncate %p\n", inode);
	1879	+ } else {
	1880	+ dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
	1881	+ inode, ci->i_work_mask);
	1882	+ iput(inode);
	1883	+ }
	1884	+}
	1885	+
	1886	+static void ceph_do_invalidate_pages(struct inode *inode)
	1887	+{
	1888	+ struct ceph_inode_info *ci = ceph_inode(inode);
1798	1889	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1799	1890	u32 orig_gen;
1800	1891	int check = 0;
..	..	@@ -1847,44 +1938,6 @@
1847	1938	out:
1848	1939	if (check)
1849	1940	ceph_check_caps(ci, 0, NULL);
1850		- iput(inode);
1851		-}
1852		-
1853		-
1854		-/*
1855		- * called by trunc_wq;
1856		- *
1857		- * We also truncate in a separate thread as well.
1858		- */
1859		-static void ceph_vmtruncate_work(struct work_struct *work)
1860		-{
1861		- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1862		- i_vmtruncate_work);
1863		- struct inode *inode = &ci->vfs_inode;
1864		-
1865		- dout("vmtruncate_work %p\n", inode);
1866		- __ceph_do_pending_vmtruncate(inode);
1867		- iput(inode);
1868		-}
1869		-
1870		-/*
1871		- * Queue an async vmtruncate. If we fail to queue work, we will handle
1872		- * the truncation the next time we call __ceph_do_pending_vmtruncate.
1873		- */
1874		-void ceph_queue_vmtruncate(struct inode *inode)
1875		-{
1876		- struct ceph_inode_info *ci = ceph_inode(inode);
1877		-
1878		- ihold(inode);
1879		-
1880		- if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1881		- &ci->i_vmtruncate_work)) {
1882		- dout("ceph_queue_vmtruncate %p\n", inode);
1883		- } else {
1884		- dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1885		- inode, ci->i_truncate_pending);
1886		- iput(inode);
1887		- }
1888	1941	}
1889	1942
1890	1943	/*
..	..	@@ -1943,9 +1996,28 @@
1943	1996	mutex_unlock(&ci->i_truncate_mutex);
1944	1997
1945	1998	if (wrbuffer_refs == 0)
1946		- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
	1999	+ ceph_check_caps(ci, 0, NULL);
1947	2000
1948	2001	wake_up_all(&ci->i_cap_wq);
	2002	+}
	2003	+
	2004	+static void ceph_inode_work(struct work_struct *work)
	2005	+{
	2006	+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
	2007	+ i_work);
	2008	+ struct inode *inode = &ci->vfs_inode;
	2009	+
	2010	+ if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
	2011	+ dout("writeback %p\n", inode);
	2012	+ filemap_fdatawrite(&inode->i_data);
	2013	+ }
	2014	+ if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
	2015	+ ceph_do_invalidate_pages(inode);
	2016	+
	2017	+ if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
	2018	+ __ceph_do_pending_vmtruncate(inode);
	2019	+
	2020	+ iput(inode);
1949	2021	}
1950	2022
1951	2023	/*
..	..	@@ -1961,7 +2033,7 @@
1961	2033	int __ceph_setattr(struct inode inode, struct iattr attr)
1962	2034	{
1963	2035	struct ceph_inode_info *ci = ceph_inode(inode);
1964		- const unsigned int ia_valid = attr->ia_valid;
	2036	+ unsigned int ia_valid = attr->ia_valid;
1965	2037	struct ceph_mds_request *req;
1966	2038	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1967	2039	struct ceph_cap_flush *prealloc_cf;
..	..	@@ -2066,6 +2138,26 @@
2066	2138	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2067	2139	}
2068	2140	}
	2141	+ if (ia_valid & ATTR_SIZE) {
	2142	+ dout("setattr %p size %lld -> %lld\n", inode,
	2143	+ inode->i_size, attr->ia_size);
	2144	+ if ((issued & CEPH_CAP_FILE_EXCL) &&
	2145	+ attr->ia_size > inode->i_size) {
	2146	+ i_size_write(inode, attr->ia_size);
	2147	+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
	2148	+ ci->i_reported_size = attr->ia_size;
	2149	+ dirtied \|= CEPH_CAP_FILE_EXCL;
	2150	+ ia_valid \|= ATTR_MTIME;
	2151	+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 \|\|
	2152	+ attr->ia_size != inode->i_size) {
	2153	+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
	2154	+ req->r_args.setattr.old_size =
	2155	+ cpu_to_le64(inode->i_size);
	2156	+ mask \|= CEPH_SETATTR_SIZE;
	2157	+ release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
	2158	+ CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
	2159	+ }
	2160	+ }
2069	2161	if (ia_valid & ATTR_MTIME) {
2070	2162	dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
2071	2163	inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
..	..	@@ -2085,25 +2177,6 @@
2085	2177	&attr->ia_mtime);
2086	2178	mask \|= CEPH_SETATTR_MTIME;
2087	2179	release \|= CEPH_CAP_FILE_SHARED \|
2088		- CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2089		- }
2090		- }
2091		- if (ia_valid & ATTR_SIZE) {
2092		- dout("setattr %p size %lld -> %lld\n", inode,
2093		- inode->i_size, attr->ia_size);
2094		- if ((issued & CEPH_CAP_FILE_EXCL) &&
2095		- attr->ia_size > inode->i_size) {
2096		- i_size_write(inode, attr->ia_size);
2097		- inode->i_blocks = calc_inode_blocks(attr->ia_size);
2098		- ci->i_reported_size = attr->ia_size;
2099		- dirtied \|= CEPH_CAP_FILE_EXCL;
2100		- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 \|\|
2101		- attr->ia_size != inode->i_size) {
2102		- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2103		- req->r_args.setattr.old_size =
2104		- cpu_to_le64(inode->i_size);
2105		- mask \|= CEPH_SETATTR_SIZE;
2106		- release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2107	2180	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2108	2181	}
2109	2182	}
..	..	@@ -2223,8 +2296,8 @@
2223	2296
2224	2297	dout("do_getattr inode %p mask %s mode 0%o\n",
2225	2298	inode, ceph_cap_string(mask), inode->i_mode);
2226		- if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
2227		- return 0;
	2299	+ if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
	2300	+ return 0;
2228	2301
2229	2302	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
2230	2303	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
..	..	@@ -2271,42 +2344,82 @@
2271	2344	return err;
2272	2345	}
2273	2346
	2347	+/* Craft a mask of needed caps given a set of requested statx attrs. */
	2348	+static int statx_to_caps(u32 want)
	2349	+{
	2350	+ int mask = 0;
	2351	+
	2352	+ if (want & (STATX_MODE\|STATX_UID\|STATX_GID\|STATX_CTIME\|STATX_BTIME))
	2353	+ mask \|= CEPH_CAP_AUTH_SHARED;
	2354	+
	2355	+ if (want & (STATX_NLINK\|STATX_CTIME))
	2356	+ mask \|= CEPH_CAP_LINK_SHARED;
	2357	+
	2358	+ if (want & (STATX_ATIME\|STATX_MTIME\|STATX_CTIME\|STATX_SIZE\|
	2359	+ STATX_BLOCKS))
	2360	+ mask \|= CEPH_CAP_FILE_SHARED;
	2361	+
	2362	+ if (want & (STATX_CTIME))
	2363	+ mask \|= CEPH_CAP_XATTR_SHARED;
	2364	+
	2365	+ return mask;
	2366	+}
	2367	+
2274	2368	/*
2275		- * Get all attributes. Hopefully somedata we'll have a statlite()
2276		- * and can limit the fields we require to be accurate.
	2369	+ * Get all the attributes. If we have sufficient caps for the requested attrs,
	2370	+ * then we can avoid talking to the MDS at all.
2277	2371	*/
2278	2372	int ceph_getattr(const struct path path, struct kstat stat,
2279	2373	u32 request_mask, unsigned int flags)
2280	2374	{
2281	2375	struct inode *inode = d_inode(path->dentry);
2282	2376	struct ceph_inode_info *ci = ceph_inode(inode);
2283		- int err;
	2377	+ u32 valid_mask = STATX_BASIC_STATS;
	2378	+ int err = 0;
2284	2379
2285		- err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
2286		- if (!err) {
2287		- generic_fillattr(inode, stat);
2288		- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
2289		- if (ceph_snap(inode) != CEPH_NOSNAP)
2290		- stat->dev = ceph_snap(inode);
2291		- else
2292		- stat->dev = 0;
2293		- if (S_ISDIR(inode->i_mode)) {
2294		- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
2295		- RBYTES))
2296		- stat->size = ci->i_rbytes;
2297		- else
2298		- stat->size = ci->i_files + ci->i_subdirs;
2299		- stat->blocks = 0;
2300		- stat->blksize = 65536;
2301		- /*
2302		- * Some applications rely on the number of st_nlink
2303		- * value on directories to be either 0 (if unlinked)
2304		- * or 2 + number of subdirectories.
2305		- */
2306		- if (stat->nlink == 1)
2307		- /* '.' + '..' + subdirs */
2308		- stat->nlink = 1 + 1 + ci->i_subdirs;
2309		- }
	2380	+ /* Skip the getattr altogether if we're asked not to sync */
	2381	+ if (!(flags & AT_STATX_DONT_SYNC)) {
	2382	+ err = ceph_do_getattr(inode, statx_to_caps(request_mask),
	2383	+ flags & AT_STATX_FORCE_SYNC);
	2384	+ if (err)
	2385	+ return err;
2310	2386	}
	2387	+
	2388	+ generic_fillattr(inode, stat);
	2389	+ stat->ino = ceph_present_inode(inode);
	2390	+
	2391	+ /*
	2392	+ * btime on newly-allocated inodes is 0, so if this is still set to
	2393	+ * that, then assume that it's not valid.
	2394	+ */
	2395	+ if (ci->i_btime.tv_sec \|\| ci->i_btime.tv_nsec) {
	2396	+ stat->btime = ci->i_btime;
	2397	+ valid_mask \|= STATX_BTIME;
	2398	+ }
	2399	+
	2400	+ if (ceph_snap(inode) == CEPH_NOSNAP)
	2401	+ stat->dev = inode->i_sb->s_dev;
	2402	+ else
	2403	+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
	2404	+
	2405	+ if (S_ISDIR(inode->i_mode)) {
	2406	+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
	2407	+ RBYTES))
	2408	+ stat->size = ci->i_rbytes;
	2409	+ else
	2410	+ stat->size = ci->i_files + ci->i_subdirs;
	2411	+ stat->blocks = 0;
	2412	+ stat->blksize = 65536;
	2413	+ /*
	2414	+ * Some applications rely on the st_nlink value of
	2415	+ * directories being either 0 (if unlinked)
	2416	+ * or 2 + number of subdirectories.
	2417	+ */
	2418	+ if (stat->nlink == 1)
	2419	+ /* '.' + '..' + subdirs */
	2420	+ stat->nlink = 1 + 1 + ci->i_subdirs;
	2421	+ }
	2422	+
	2423	+ stat->result_mask = request_mask & valid_mask;
2311	2424	return err;
2312	2425	}