hc
2024-09-20 a36159eec6ca17402b0e146b86efaf76568dc353
kernel/fs/ceph/inode.c
....@@ -13,6 +13,7 @@
1313 #include <linux/posix_acl.h>
1414 #include <linux/random.h>
1515 #include <linux/sort.h>
16
+#include <linux/iversion.h>
1617
1718 #include "super.h"
1819 #include "mds_client.h"
....@@ -33,36 +34,38 @@
3334
3435 static const struct inode_operations ceph_symlink_iops;
3536
36
-static void ceph_invalidate_work(struct work_struct *work);
37
-static void ceph_writeback_work(struct work_struct *work);
38
-static void ceph_vmtruncate_work(struct work_struct *work);
37
+static void ceph_inode_work(struct work_struct *work);
3938
4039 /*
4140 * find or create an inode, given the ceph ino number
4241 */
4342 static int ceph_set_ino_cb(struct inode *inode, void *data)
4443 {
45
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
46
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
44
+ struct ceph_inode_info *ci = ceph_inode(inode);
45
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
46
+
47
+ ci->i_vino = *(struct ceph_vino *)data;
48
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
49
+ inode_set_iversion_raw(inode, 0);
50
+ percpu_counter_inc(&mdsc->metric.total_inodes);
51
+
4752 return 0;
4853 }
4954
5055 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
5156 {
5257 struct inode *inode;
53
- ino_t t = ceph_vino_to_ino(vino);
5458
55
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
59
+ if (ceph_vino_is_reserved(vino))
60
+ return ERR_PTR(-EREMOTEIO);
61
+
62
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
63
+ ceph_set_ino_cb, &vino);
5664 if (!inode)
5765 return ERR_PTR(-ENOMEM);
58
- if (inode->i_state & I_NEW) {
59
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
60
- inode, ceph_vinop(inode), (u64)inode->i_ino);
61
- unlock_new_inode(inode);
62
- }
6366
64
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
65
- vino.snap, inode);
67
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
68
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
6669 return inode;
6770 }
6871
....@@ -84,10 +87,19 @@
8487 inode->i_mode = parent->i_mode;
8588 inode->i_uid = parent->i_uid;
8689 inode->i_gid = parent->i_gid;
87
- inode->i_op = &ceph_snapdir_iops;
88
- inode->i_fop = &ceph_snapdir_fops;
89
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
90
+ inode->i_mtime = parent->i_mtime;
91
+ inode->i_ctime = parent->i_ctime;
92
+ inode->i_atime = parent->i_atime;
9093 ci->i_rbytes = 0;
94
+ ci->i_btime = ceph_inode(parent)->i_btime;
95
+
96
+ if (inode->i_state & I_NEW) {
97
+ inode->i_op = &ceph_snapdir_iops;
98
+ inode->i_fop = &ceph_snapdir_fops;
99
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
100
+ unlock_new_inode(inode);
101
+ }
102
+
91103 return inode;
92104 }
93105
....@@ -445,6 +457,7 @@
445457 ci->i_max_files = 0;
446458
447459 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
460
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
448461 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
449462
450463 ci->i_fragtree = RB_ROOT;
....@@ -469,13 +482,13 @@
469482 ci->i_prealloc_cap_flush = NULL;
470483 INIT_LIST_HEAD(&ci->i_cap_flush_list);
471484 init_waitqueue_head(&ci->i_cap_wq);
472
- ci->i_hold_caps_min = 0;
473485 ci->i_hold_caps_max = 0;
474486 INIT_LIST_HEAD(&ci->i_cap_delay_list);
475487 INIT_LIST_HEAD(&ci->i_cap_snaps);
476488 ci->i_head_snapc = NULL;
477489 ci->i_snap_caps = 0;
478490
491
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
479492 for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
480493 ci->i_nr_by_mode[i] = 0;
481494
....@@ -494,10 +507,11 @@
494507 ci->i_rdcache_ref = 0;
495508 ci->i_wr_ref = 0;
496509 ci->i_wb_ref = 0;
510
+ ci->i_fx_ref = 0;
497511 ci->i_wrbuffer_ref = 0;
498512 ci->i_wrbuffer_ref_head = 0;
499513 atomic_set(&ci->i_filelock_ref, 0);
500
- atomic_set(&ci->i_shared_gen, 0);
514
+ atomic_set(&ci->i_shared_gen, 1);
501515 ci->i_rdcache_gen = 0;
502516 ci->i_rdcache_revoking = 0;
503517
....@@ -509,19 +523,17 @@
509523 INIT_LIST_HEAD(&ci->i_snap_realm_item);
510524 INIT_LIST_HEAD(&ci->i_snap_flush_item);
511525
512
- INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
513
- INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
514
-
515
- INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
526
+ INIT_WORK(&ci->i_work, ceph_inode_work);
527
+ ci->i_work_mask = 0;
528
+ memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
516529
517530 ceph_fscache_inode_init(ci);
518531
519532 return &ci->vfs_inode;
520533 }
521534
522
-static void ceph_i_callback(struct rcu_head *head)
535
+void ceph_free_inode(struct inode *inode)
523536 {
524
- struct inode *inode = container_of(head, struct inode, i_rcu);
525537 struct ceph_inode_info *ci = ceph_inode(inode);
526538
527539 kfree(ci->i_symlink);
....@@ -531,17 +543,20 @@
531543 void ceph_evict_inode(struct inode *inode)
532544 {
533545 struct ceph_inode_info *ci = ceph_inode(inode);
546
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
534547 struct ceph_inode_frag *frag;
535548 struct rb_node *n;
536549
537550 dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
551
+
552
+ percpu_counter_dec(&mdsc->metric.total_inodes);
538553
539554 truncate_inode_pages_final(&inode->i_data);
540555 clear_inode(inode);
541556
542557 ceph_fscache_unregister_inode_cookie(ci);
543558
544
- ceph_queue_caps_release(inode);
559
+ __ceph_remove_caps(ci);
545560
546561 if (__ceph_has_any_quota(ci))
547562 ceph_adjust_quota_realms_count(inode, false);
....@@ -551,18 +566,21 @@
551566 * caps in i_snap_caps.
552567 */
553568 if (ci->i_snap_realm) {
554
- struct ceph_mds_client *mdsc =
555
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
556
- struct ceph_snap_realm *realm = ci->i_snap_realm;
557
-
558
- dout(" dropping residual ref to snap realm %p\n", realm);
559
- spin_lock(&realm->inodes_with_caps_lock);
560
- list_del_init(&ci->i_snap_realm_item);
561
- ci->i_snap_realm = NULL;
562
- if (realm->ino == ci->i_vino.ino)
563
- realm->inode = NULL;
564
- spin_unlock(&realm->inodes_with_caps_lock);
565
- ceph_put_snap_realm(mdsc, realm);
569
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
570
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
571
+ dout(" dropping residual ref to snap realm %p\n",
572
+ realm);
573
+ spin_lock(&realm->inodes_with_caps_lock);
574
+ list_del_init(&ci->i_snap_realm_item);
575
+ ci->i_snap_realm = NULL;
576
+ if (realm->ino == ci->i_vino.ino)
577
+ realm->inode = NULL;
578
+ spin_unlock(&realm->inodes_with_caps_lock);
579
+ ceph_put_snap_realm(mdsc, realm);
580
+ } else {
581
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
582
+ ci->i_snap_realm = NULL;
583
+ }
566584 }
567585
568586 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
....@@ -579,21 +597,7 @@
579597 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
580598
581599 ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
582
-}
583
-
584
-void ceph_destroy_inode(struct inode *inode)
585
-{
586
- call_rcu(&inode->i_rcu, ceph_i_callback);
587
-}
588
-
589
-int ceph_drop_inode(struct inode *inode)
590
-{
591
- /*
592
- * Positve dentry and corresponding inode are always accompanied
593
- * in MDS reply. So no need to keep inode in the cache after
594
- * dropping all its aliases.
595
- */
596
- return 1;
600
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
597601 }
598602
599603 static inline blkcnt_t calc_inode_blocks(u64 size)
....@@ -644,7 +648,7 @@
644648 if ((issued & (CEPH_CAP_FILE_CACHE|
645649 CEPH_CAP_FILE_BUFFER)) ||
646650 mapping_mapped(inode->i_mapping) ||
647
- __ceph_caps_file_wanted(ci)) {
651
+ __ceph_is_file_opened(ci)) {
648652 ci->i_truncate_pending++;
649653 queue_trunc = 1;
650654 }
....@@ -735,14 +739,13 @@
735739 * Populate an inode based on info from mds. May be called on new or
736740 * existing inodes.
737741 */
738
-static int fill_inode(struct inode *inode, struct page *locked_page,
739
- struct ceph_mds_reply_info_in *iinfo,
740
- struct ceph_mds_reply_dirfrag *dirinfo,
741
- struct ceph_mds_session *session,
742
- unsigned long ttl_from, int cap_fmode,
743
- struct ceph_cap_reservation *caps_reservation)
742
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
743
+ struct ceph_mds_reply_info_in *iinfo,
744
+ struct ceph_mds_reply_dirfrag *dirinfo,
745
+ struct ceph_mds_session *session, int cap_fmode,
746
+ struct ceph_cap_reservation *caps_reservation)
744747 {
745
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
748
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
746749 struct ceph_mds_reply_inode *info = iinfo->in;
747750 struct ceph_inode_info *ci = ceph_inode(inode);
748751 int issued, new_issued, info_caps;
....@@ -757,7 +760,9 @@
757760 bool new_version = false;
758761 bool fill_inline = false;
759762
760
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
763
+ lockdep_assert_held(&mdsc->snap_rwsem);
764
+
765
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
761766 inode, ceph_vinop(inode), le64_to_cpu(info->version),
762767 ci->i_version);
763768
....@@ -778,13 +783,16 @@
778783 if (iinfo->xattr_len > 4) {
779784 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
780785 if (!xattr_blob)
781
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
786
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
782787 iinfo->xattr_len);
783788 }
784789
785790 if (iinfo->pool_ns_len > 0)
786791 pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
787792 iinfo->pool_ns_len);
793
+
794
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
795
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
788796
789797 spin_lock(&ci->i_ceph_lock);
790798
....@@ -803,6 +811,9 @@
803811 ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
804812 le64_to_cpu(info->version) > (ci->i_version & ~1)))
805813 new_version = true;
814
+
815
+ /* Update change_attribute */
816
+ inode_set_max_iversion_raw(inode, iinfo->change_attr);
806817
807818 __ceph_caps_issued(ci, &issued);
808819 issued |= __ceph_caps_dirty(ci);
....@@ -827,6 +838,8 @@
827838 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
828839 from_kuid(&init_user_ns, inode->i_uid),
829840 from_kgid(&init_user_ns, inode->i_gid));
841
+ ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
842
+ ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
830843 }
831844
832845 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
....@@ -884,6 +897,7 @@
884897 ci->i_rbytes = le64_to_cpu(info->rbytes);
885898 ci->i_rfiles = le64_to_cpu(info->rfiles);
886899 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
900
+ ci->i_dir_pin = iinfo->dir_pin;
887901 ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
888902 }
889903 }
....@@ -900,6 +914,7 @@
900914 iinfo->xattr_data, iinfo->xattr_len);
901915 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
902916 ceph_forget_all_cached_acls(inode);
917
+ ceph_security_invalidate_secctx(inode);
903918 xattr_blob = NULL;
904919 }
905920
....@@ -914,6 +929,7 @@
914929 case S_IFBLK:
915930 case S_IFCHR:
916931 case S_IFSOCK:
932
+ inode->i_blkbits = PAGE_SHIFT;
917933 init_special_inode(inode, inode->i_mode, inode->i_rdev);
918934 inode->i_op = &ceph_file_iops;
919935 break;
....@@ -930,8 +946,9 @@
930946 spin_unlock(&ci->i_ceph_lock);
931947
932948 if (symlen != i_size_read(inode)) {
933
- pr_err("fill_inode %llx.%llx BAD symlink "
934
- "size %lld\n", ceph_vinop(inode),
949
+ pr_err("%s %llx.%llx BAD symlink "
950
+ "size %lld\n", __func__,
951
+ ceph_vinop(inode),
935952 i_size_read(inode));
936953 i_size_write(inode, symlen);
937954 inode->i_blocks = calc_inode_blocks(symlen);
....@@ -955,7 +972,7 @@
955972 inode->i_fop = &ceph_dir_fops;
956973 break;
957974 default:
958
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
975
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
959976 ceph_vinop(inode), inode->i_mode);
960977 }
961978
....@@ -964,7 +981,7 @@
964981 if (ceph_snap(inode) == CEPH_NOSNAP) {
965982 ceph_add_cap(inode, session,
966983 le64_to_cpu(info->cap.cap_id),
967
- cap_fmode, info_caps,
984
+ info_caps,
968985 le32_to_cpu(info->cap.wanted),
969986 le32_to_cpu(info->cap.seq),
970987 le32_to_cpu(info->cap.mseq),
....@@ -989,13 +1006,7 @@
9891006 dout(" %p got snap_caps %s\n", inode,
9901007 ceph_cap_string(info_caps));
9911008 ci->i_snap_caps |= info_caps;
992
- if (cap_fmode >= 0)
993
- __ceph_get_fmode(ci, cap_fmode);
9941009 }
995
- } else if (cap_fmode >= 0) {
996
- pr_warn("mds issued no caps on %llx.%llx\n",
997
- ceph_vinop(inode));
998
- __ceph_get_fmode(ci, cap_fmode);
9991010 }
10001011
10011012 if (iinfo->inline_version > 0 &&
....@@ -1005,6 +1016,13 @@
10051016 if (ci->i_inline_version != CEPH_INLINE_NONE &&
10061017 (locked_page || (info_caps & cache_caps)))
10071018 fill_inline = true;
1019
+ }
1020
+
1021
+ if (cap_fmode >= 0) {
1022
+ if (!info_caps)
1023
+ pr_warn("mds issued no caps on %llx.%llx\n",
1024
+ ceph_vinop(inode));
1025
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
10081026 }
10091027
10101028 spin_unlock(&ci->i_ceph_lock);
....@@ -1039,62 +1057,46 @@
10391057 }
10401058
10411059 /*
1042
- * caller should hold session s_mutex.
1060
+ * caller should hold session s_mutex and dentry->d_lock.
10431061 */
1044
-static void update_dentry_lease(struct dentry *dentry,
1045
- struct ceph_mds_reply_lease *lease,
1046
- struct ceph_mds_session *session,
1047
- unsigned long from_time,
1048
- struct ceph_vino *tgt_vino,
1049
- struct ceph_vino *dir_vino)
1062
+static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
1063
+ struct ceph_mds_reply_lease *lease,
1064
+ struct ceph_mds_session *session,
1065
+ unsigned long from_time,
1066
+ struct ceph_mds_session **old_lease_session)
10501067 {
10511068 struct ceph_dentry_info *di = ceph_dentry(dentry);
1069
+ unsigned mask = le16_to_cpu(lease->mask);
10521070 long unsigned duration = le32_to_cpu(lease->duration_ms);
10531071 long unsigned ttl = from_time + (duration * HZ) / 1000;
10541072 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
1055
- struct inode *dir;
1056
- struct ceph_mds_session *old_lease_session = NULL;
10571073
1058
- /*
1059
- * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
1060
- * we expect a negative dentry.
1061
- */
1062
- if (!tgt_vino && d_really_is_positive(dentry))
1063
- return;
1064
-
1065
- if (tgt_vino && (d_really_is_negative(dentry) ||
1066
- !ceph_ino_compare(d_inode(dentry), tgt_vino)))
1067
- return;
1068
-
1069
- spin_lock(&dentry->d_lock);
10701074 dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
10711075 dentry, duration, ttl);
10721076
1073
- dir = d_inode(dentry->d_parent);
1074
-
1075
- /* make sure parent matches dir_vino */
1076
- if (!ceph_ino_compare(dir, dir_vino))
1077
- goto out_unlock;
1078
-
10791077 /* only track leases on regular dentries */
10801078 if (ceph_snap(dir) != CEPH_NOSNAP)
1081
- goto out_unlock;
1079
+ return;
1080
+
1081
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
1082
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
1083
+ else
1084
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
10821085
10831086 di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
1084
-
1085
- if (duration == 0)
1086
- goto out_unlock;
1087
+ if (!(mask & CEPH_LEASE_VALID)) {
1088
+ __ceph_dentry_dir_lease_touch(di);
1089
+ return;
1090
+ }
10871091
10881092 if (di->lease_gen == session->s_cap_gen &&
10891093 time_before(ttl, di->time))
1090
- goto out_unlock; /* we already have a newer lease. */
1094
+ return; /* we already have a newer lease. */
10911095
10921096 if (di->lease_session && di->lease_session != session) {
1093
- old_lease_session = di->lease_session;
1097
+ *old_lease_session = di->lease_session;
10941098 di->lease_session = NULL;
10951099 }
1096
-
1097
- ceph_dentry_lru_touch(dentry);
10981100
10991101 if (!di->lease_session)
11001102 di->lease_session = ceph_get_mds_session(session);
....@@ -1103,18 +1105,75 @@
11031105 di->lease_renew_after = half_ttl;
11041106 di->lease_renew_from = 0;
11051107 di->time = ttl;
1108
+
1109
+ __ceph_dentry_lease_touch(di);
1110
+}
1111
+
1112
+static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
1113
+ struct ceph_mds_reply_lease *lease,
1114
+ struct ceph_mds_session *session,
1115
+ unsigned long from_time)
1116
+{
1117
+ struct ceph_mds_session *old_lease_session = NULL;
1118
+ spin_lock(&dentry->d_lock);
1119
+ __update_dentry_lease(dir, dentry, lease, session, from_time,
1120
+ &old_lease_session);
1121
+ spin_unlock(&dentry->d_lock);
1122
+ ceph_put_mds_session(old_lease_session);
1123
+}
1124
+
1125
+/*
1126
+ * update dentry lease without having parent inode locked
1127
+ */
1128
+static void update_dentry_lease_careful(struct dentry *dentry,
1129
+ struct ceph_mds_reply_lease *lease,
1130
+ struct ceph_mds_session *session,
1131
+ unsigned long from_time,
1132
+ char *dname, u32 dname_len,
1133
+ struct ceph_vino *pdvino,
1134
+ struct ceph_vino *ptvino)
1135
+
1136
+{
1137
+ struct inode *dir;
1138
+ struct ceph_mds_session *old_lease_session = NULL;
1139
+
1140
+ spin_lock(&dentry->d_lock);
1141
+ /* make sure dentry's name matches target */
1142
+ if (dentry->d_name.len != dname_len ||
1143
+ memcmp(dentry->d_name.name, dname, dname_len))
1144
+ goto out_unlock;
1145
+
1146
+ dir = d_inode(dentry->d_parent);
1147
+ /* make sure parent matches dvino */
1148
+ if (!ceph_ino_compare(dir, pdvino))
1149
+ goto out_unlock;
1150
+
1151
+ /* make sure dentry's inode matches target. NULL ptvino means that
1152
+ * we expect a negative dentry */
1153
+ if (ptvino) {
1154
+ if (d_really_is_negative(dentry))
1155
+ goto out_unlock;
1156
+ if (!ceph_ino_compare(d_inode(dentry), ptvino))
1157
+ goto out_unlock;
1158
+ } else {
1159
+ if (d_really_is_positive(dentry))
1160
+ goto out_unlock;
1161
+ }
1162
+
1163
+ __update_dentry_lease(dir, dentry, lease, session,
1164
+ from_time, &old_lease_session);
11061165 out_unlock:
11071166 spin_unlock(&dentry->d_lock);
1108
- if (old_lease_session)
1109
- ceph_put_mds_session(old_lease_session);
1167
+ ceph_put_mds_session(old_lease_session);
11101168 }
11111169
11121170 /*
11131171 * splice a dentry to an inode.
11141172 * caller must hold directory i_mutex for this to be safe.
11151173 */
1116
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
1174
+static int splice_dentry(struct dentry **pdn, struct inode *in)
11171175 {
1176
+ struct dentry *dn = *pdn;
11181177 struct dentry *realdn;
11191178
11201179 BUG_ON(d_inode(dn));
....@@ -1147,28 +1206,23 @@
11471206 if (IS_ERR(realdn)) {
11481207 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
11491208 PTR_ERR(realdn), dn, in, ceph_vinop(in));
1150
- dn = realdn;
1151
- /*
1152
- * Caller should release 'dn' in the case of error.
1153
- * If 'req->r_dentry' is passed to this function,
1154
- * caller should leave 'req->r_dentry' untouched.
1155
- */
1156
- goto out;
1157
- } else if (realdn) {
1209
+ return PTR_ERR(realdn);
1210
+ }
1211
+
1212
+ if (realdn) {
11581213 dout("dn %p (%d) spliced with %p (%d) "
11591214 "inode %p ino %llx.%llx\n",
11601215 dn, d_count(dn),
11611216 realdn, d_count(realdn),
11621217 d_inode(realdn), ceph_vinop(d_inode(realdn)));
11631218 dput(dn);
1164
- dn = realdn;
1219
+ *pdn = realdn;
11651220 } else {
11661221 BUG_ON(!ceph_dentry(dn));
11671222 dout("dn %p attached to %p ino %llx.%llx\n",
11681223 dn, d_inode(dn), ceph_vinop(d_inode(dn)));
11691224 }
1170
-out:
1171
- return dn;
1225
+ return 0;
11721226 }
11731227
11741228 /*
....@@ -1205,17 +1259,18 @@
12051259 struct inode *dir = req->r_parent;
12061260
12071261 if (dir) {
1208
- err = fill_inode(dir, NULL,
1209
- &rinfo->diri, rinfo->dirfrag,
1210
- session, req->r_request_started, -1,
1211
- &req->r_caps_reservation);
1262
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
1263
+ rinfo->dirfrag, session, -1,
1264
+ &req->r_caps_reservation);
12121265 if (err < 0)
12131266 goto done;
12141267 } else {
12151268 WARN_ON_ONCE(1);
12161269 }
12171270
1218
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
1271
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1272
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1273
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
12191274 struct qstr dname;
12201275 struct dentry *dn, *parent;
12211276
....@@ -1270,18 +1325,25 @@
12701325 err = PTR_ERR(in);
12711326 goto done;
12721327 }
1273
- req->r_target_inode = in;
12741328
1275
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
1276
- session, req->r_request_started,
1329
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
1330
+ NULL, session,
12771331 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1278
- rinfo->head->result == 0) ? req->r_fmode : -1,
1332
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
1333
+ rinfo->head->result == 0) ? req->r_fmode : -1,
12791334 &req->r_caps_reservation);
12801335 if (err < 0) {
1281
- pr_err("fill_inode badness %p %llx.%llx\n",
1336
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
12821337 in, ceph_vinop(in));
1338
+ if (in->i_state & I_NEW)
1339
+ discard_new_inode(in);
1340
+ else
1341
+ iput(in);
12831342 goto done;
12841343 }
1344
+ req->r_target_inode = in;
1345
+ if (in->i_state & I_NEW)
1346
+ unlock_new_inode(in);
12851347 }
12861348
12871349 /*
....@@ -1353,7 +1415,12 @@
13531415 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
13541416 ceph_dentry(req->r_old_dentry)->offset);
13551417
1356
- dn = req->r_old_dentry; /* use old_dentry */
1418
+ /* swap r_dentry and r_old_dentry in case that
1419
+ * splice_dentry() gets called later. This is safe
1420
+ * because no other place will use them */
1421
+ req->r_dentry = req->r_old_dentry;
1422
+ req->r_old_dentry = dn;
1423
+ dn = req->r_dentry;
13571424 }
13581425
13591426 /* null dentry? */
....@@ -1366,10 +1433,9 @@
13661433 } else if (have_lease) {
13671434 if (d_unhashed(dn))
13681435 d_add(dn, NULL);
1369
- update_dentry_lease(dn, rinfo->dlease,
1370
- session,
1371
- req->r_request_started,
1372
- NULL, &dvino);
1436
+ update_dentry_lease(dir, dn,
1437
+ rinfo->dlease, session,
1438
+ req->r_request_started);
13731439 }
13741440 goto done;
13751441 }
....@@ -1378,12 +1444,10 @@
13781444 if (d_really_is_negative(dn)) {
13791445 ceph_dir_clear_ordered(dir);
13801446 ihold(in);
1381
- dn = splice_dentry(dn, in);
1382
- if (IS_ERR(dn)) {
1383
- err = PTR_ERR(dn);
1447
+ err = splice_dentry(&req->r_dentry, in);
1448
+ if (err < 0)
13841449 goto done;
1385
- }
1386
- req->r_dentry = dn; /* may have spliced */
1450
+ dn = req->r_dentry; /* may have spliced */
13871451 } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
13881452 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
13891453 dn, d_inode(dn), ceph_vinop(d_inode(dn)),
....@@ -1393,53 +1457,41 @@
13931457 }
13941458
13951459 if (have_lease) {
1396
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1397
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1398
- update_dentry_lease(dn, rinfo->dlease, session,
1399
- req->r_request_started,
1400
- &tvino, &dvino);
1460
+ update_dentry_lease(dir, dn,
1461
+ rinfo->dlease, session,
1462
+ req->r_request_started);
14011463 }
14021464 dout(" final dn %p\n", dn);
14031465 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
14041466 req->r_op == CEPH_MDS_OP_MKSNAP) &&
14051467 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
14061468 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1407
- struct dentry *dn = req->r_dentry;
14081469 struct inode *dir = req->r_parent;
14091470
14101471 /* fill out a snapdir LOOKUPSNAP dentry */
1411
- BUG_ON(!dn);
14121472 BUG_ON(!dir);
14131473 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1414
- dout(" linking snapped dir %p to dn %p\n", in, dn);
1474
+ BUG_ON(!req->r_dentry);
1475
+ dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
14151476 ceph_dir_clear_ordered(dir);
14161477 ihold(in);
1417
- dn = splice_dentry(dn, in);
1418
- if (IS_ERR(dn)) {
1419
- err = PTR_ERR(dn);
1478
+ err = splice_dentry(&req->r_dentry, in);
1479
+ if (err < 0)
14201480 goto done;
1421
- }
1422
- req->r_dentry = dn; /* may have spliced */
1423
- } else if (rinfo->head->is_dentry) {
1481
+ } else if (rinfo->head->is_dentry && req->r_dentry) {
1482
+ /* parent inode is not locked, be careful */
14241483 struct ceph_vino *ptvino = NULL;
1425
-
1426
- if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
1427
- le32_to_cpu(rinfo->dlease->duration_ms)) {
1428
- dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1429
- dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1430
-
1431
- if (rinfo->head->is_target) {
1432
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1433
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1434
- ptvino = &tvino;
1435
- }
1436
-
1437
- update_dentry_lease(req->r_dentry, rinfo->dlease,
1438
- session, req->r_request_started, ptvino,
1439
- &dvino);
1440
- } else {
1441
- dout("%s: no dentry lease or dir cap\n", __func__);
1484
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1485
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1486
+ if (rinfo->head->is_target) {
1487
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1488
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1489
+ ptvino = &tvino;
14421490 }
1491
+ update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
1492
+ session, req->r_request_started,
1493
+ rinfo->dname, rinfo->dname_len,
1494
+ &dvino, ptvino);
14431495 }
14441496 done:
14451497 dout("fill_trace done err=%d\n", err);
....@@ -1470,14 +1522,22 @@
14701522 dout("new_inode badness got %d\n", err);
14711523 continue;
14721524 }
1473
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1474
- req->r_request_started, -1,
1475
- &req->r_caps_reservation);
1525
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
1526
+ -1, &req->r_caps_reservation);
14761527 if (rc < 0) {
1477
- pr_err("fill_inode badness on %p got %d\n", in, rc);
1528
+ pr_err("ceph_fill_inode badness on %p got %d\n",
1529
+ in, rc);
14781530 err = rc;
1531
+ if (in->i_state & I_NEW) {
1532
+ ihold(in);
1533
+ discard_new_inode(in);
1534
+ }
1535
+ } else if (in->i_state & I_NEW) {
1536
+ unlock_new_inode(in);
14791537 }
1480
- iput(in);
1538
+
1539
+ /* avoid calling iput_final() in mds dispatch threads */
1540
+ ceph_async_iput(in);
14811541 }
14821542
14831543 return err;
....@@ -1600,7 +1660,7 @@
16001660 /* FIXME: release caps/leases if error occurs */
16011661 for (i = 0; i < rinfo->dir_nr; i++) {
16021662 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1603
- struct ceph_vino tvino, dvino;
1663
+ struct ceph_vino tvino;
16041664
16051665 dname.name = rde->name;
16061666 dname.len = rde->name_len;
....@@ -1670,43 +1730,45 @@
16701730 }
16711731 }
16721732
1673
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1674
- req->r_request_started, -1,
1675
- &req->r_caps_reservation);
1733
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
1734
+ -1, &req->r_caps_reservation);
16761735 if (ret < 0) {
1677
- pr_err("fill_inode badness on %p\n", in);
1678
- if (d_really_is_negative(dn))
1679
- iput(in);
1736
+ pr_err("ceph_fill_inode badness on %p\n", in);
1737
+ if (d_really_is_negative(dn)) {
1738
+ /* avoid calling iput_final() in mds
1739
+ * dispatch threads */
1740
+ if (in->i_state & I_NEW) {
1741
+ ihold(in);
1742
+ discard_new_inode(in);
1743
+ }
1744
+ ceph_async_iput(in);
1745
+ }
16801746 d_drop(dn);
16811747 err = ret;
16821748 goto next_item;
16831749 }
1750
+ if (in->i_state & I_NEW)
1751
+ unlock_new_inode(in);
16841752
16851753 if (d_really_is_negative(dn)) {
1686
- struct dentry *realdn;
1687
-
16881754 if (ceph_security_xattr_deadlock(in)) {
16891755 dout(" skip splicing dn %p to inode %p"
16901756 " (security xattr deadlock)\n", dn, in);
1691
- iput(in);
1757
+ ceph_async_iput(in);
16921758 skipped++;
16931759 goto next_item;
16941760 }
16951761
1696
- realdn = splice_dentry(dn, in);
1697
- if (IS_ERR(realdn)) {
1698
- err = PTR_ERR(realdn);
1699
- d_drop(dn);
1762
+ err = splice_dentry(&dn, in);
1763
+ if (err < 0)
17001764 goto next_item;
1701
- }
1702
- dn = realdn;
17031765 }
17041766
17051767 ceph_dentry(dn)->offset = rde->offset;
17061768
1707
- dvino = ceph_vino(d_inode(parent));
1708
- update_dentry_lease(dn, rde->lease, req->r_session,
1709
- req->r_request_started, &tvino, &dvino);
1769
+ update_dentry_lease(d_inode(parent), dn,
1770
+ rde->lease, req->r_session,
1771
+ req->r_request_started);
17101772
17111773 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
17121774 ret = fill_readdir_cache(d_inode(parent), dn,
....@@ -1715,8 +1777,7 @@
17151777 err = ret;
17161778 }
17171779 next_item:
1718
- if (dn)
1719
- dput(dn);
1780
+ dput(dn);
17201781 }
17211782 out:
17221783 if (err == 0 && skipped == 0) {
....@@ -1745,30 +1806,42 @@
17451806 }
17461807
17471808 /*
1809
+ * Put reference to inode, but avoid calling iput_final() in current thread.
1810
+ * iput_final() may wait for readahead pages. The wait can cause deadlock in
1811
+ * some contexts.
1812
+ */
1813
+void ceph_async_iput(struct inode *inode)
1814
+{
1815
+ if (!inode)
1816
+ return;
1817
+ for (;;) {
1818
+ if (atomic_add_unless(&inode->i_count, -1, 1))
1819
+ break;
1820
+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
1821
+ &ceph_inode(inode)->i_work))
1822
+ break;
1823
+ /* queue work failed, i_count must be at least 2 */
1824
+ }
1825
+}
1826
+
1827
+/*
17481828 * Write back inode data in a worker thread. (This can't be done
17491829 * in the message handler context.)
17501830 */
17511831 void ceph_queue_writeback(struct inode *inode)
17521832 {
1833
+ struct ceph_inode_info *ci = ceph_inode(inode);
1834
+ set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
1835
+
17531836 ihold(inode);
1754
- if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1755
- &ceph_inode(inode)->i_wb_work)) {
1837
+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
1838
+ &ci->i_work)) {
17561839 dout("ceph_queue_writeback %p\n", inode);
17571840 } else {
1758
- dout("ceph_queue_writeback %p failed\n", inode);
1841
+ dout("ceph_queue_writeback %p already queued, mask=%lx\n",
1842
+ inode, ci->i_work_mask);
17591843 iput(inode);
17601844 }
1761
-}
1762
-
1763
-static void ceph_writeback_work(struct work_struct *work)
1764
-{
1765
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1766
- i_wb_work);
1767
- struct inode *inode = &ci->vfs_inode;
1768
-
1769
- dout("writeback %p\n", inode);
1770
- filemap_fdatawrite(&inode->i_data);
1771
- iput(inode);
17721845 }
17731846
17741847 /*
....@@ -1776,25 +1849,43 @@
17761849 */
17771850 void ceph_queue_invalidate(struct inode *inode)
17781851 {
1852
+ struct ceph_inode_info *ci = ceph_inode(inode);
1853
+ set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
1854
+
17791855 ihold(inode);
1780
- if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1781
- &ceph_inode(inode)->i_pg_inv_work)) {
1856
+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
1857
+ &ceph_inode(inode)->i_work)) {
17821858 dout("ceph_queue_invalidate %p\n", inode);
17831859 } else {
1784
- dout("ceph_queue_invalidate %p failed\n", inode);
1860
+ dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
1861
+ inode, ci->i_work_mask);
17851862 iput(inode);
17861863 }
17871864 }
17881865
17891866 /*
1790
- * Invalidate inode pages in a worker thread. (This can't be done
1791
- * in the message handler context.)
1867
+ * Queue an async vmtruncate. If we fail to queue work, we will handle
1868
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
17921869 */
1793
-static void ceph_invalidate_work(struct work_struct *work)
1870
+void ceph_queue_vmtruncate(struct inode *inode)
17941871 {
1795
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1796
- i_pg_inv_work);
1797
- struct inode *inode = &ci->vfs_inode;
1872
+ struct ceph_inode_info *ci = ceph_inode(inode);
1873
+ set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
1874
+
1875
+ ihold(inode);
1876
+ if (queue_work(ceph_inode_to_client(inode)->inode_wq,
1877
+ &ci->i_work)) {
1878
+ dout("ceph_queue_vmtruncate %p\n", inode);
1879
+ } else {
1880
+ dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
1881
+ inode, ci->i_work_mask);
1882
+ iput(inode);
1883
+ }
1884
+}
1885
+
1886
+static void ceph_do_invalidate_pages(struct inode *inode)
1887
+{
1888
+ struct ceph_inode_info *ci = ceph_inode(inode);
17981889 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
17991890 u32 orig_gen;
18001891 int check = 0;
....@@ -1847,44 +1938,6 @@
18471938 out:
18481939 if (check)
18491940 ceph_check_caps(ci, 0, NULL);
1850
- iput(inode);
1851
-}
1852
-
1853
-
1854
-/*
1855
- * called by trunc_wq;
1856
- *
1857
- * We also truncate in a separate thread as well.
1858
- */
1859
-static void ceph_vmtruncate_work(struct work_struct *work)
1860
-{
1861
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1862
- i_vmtruncate_work);
1863
- struct inode *inode = &ci->vfs_inode;
1864
-
1865
- dout("vmtruncate_work %p\n", inode);
1866
- __ceph_do_pending_vmtruncate(inode);
1867
- iput(inode);
1868
-}
1869
-
1870
-/*
1871
- * Queue an async vmtruncate. If we fail to queue work, we will handle
1872
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
1873
- */
1874
-void ceph_queue_vmtruncate(struct inode *inode)
1875
-{
1876
- struct ceph_inode_info *ci = ceph_inode(inode);
1877
-
1878
- ihold(inode);
1879
-
1880
- if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1881
- &ci->i_vmtruncate_work)) {
1882
- dout("ceph_queue_vmtruncate %p\n", inode);
1883
- } else {
1884
- dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1885
- inode, ci->i_truncate_pending);
1886
- iput(inode);
1887
- }
18881941 }
18891942
18901943 /*
....@@ -1943,9 +1996,28 @@
19431996 mutex_unlock(&ci->i_truncate_mutex);
19441997
19451998 if (wrbuffer_refs == 0)
1946
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1999
+ ceph_check_caps(ci, 0, NULL);
19472000
19482001 wake_up_all(&ci->i_cap_wq);
2002
+}
2003
+
2004
+static void ceph_inode_work(struct work_struct *work)
2005
+{
2006
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
2007
+ i_work);
2008
+ struct inode *inode = &ci->vfs_inode;
2009
+
2010
+ if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
2011
+ dout("writeback %p\n", inode);
2012
+ filemap_fdatawrite(&inode->i_data);
2013
+ }
2014
+ if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
2015
+ ceph_do_invalidate_pages(inode);
2016
+
2017
+ if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
2018
+ __ceph_do_pending_vmtruncate(inode);
2019
+
2020
+ iput(inode);
19492021 }
19502022
19512023 /*
....@@ -1961,7 +2033,7 @@
19612033 int __ceph_setattr(struct inode *inode, struct iattr *attr)
19622034 {
19632035 struct ceph_inode_info *ci = ceph_inode(inode);
1964
- const unsigned int ia_valid = attr->ia_valid;
2036
+ unsigned int ia_valid = attr->ia_valid;
19652037 struct ceph_mds_request *req;
19662038 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
19672039 struct ceph_cap_flush *prealloc_cf;
....@@ -2066,6 +2138,26 @@
20662138 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
20672139 }
20682140 }
2141
+ if (ia_valid & ATTR_SIZE) {
2142
+ dout("setattr %p size %lld -> %lld\n", inode,
2143
+ inode->i_size, attr->ia_size);
2144
+ if ((issued & CEPH_CAP_FILE_EXCL) &&
2145
+ attr->ia_size > inode->i_size) {
2146
+ i_size_write(inode, attr->ia_size);
2147
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
2148
+ ci->i_reported_size = attr->ia_size;
2149
+ dirtied |= CEPH_CAP_FILE_EXCL;
2150
+ ia_valid |= ATTR_MTIME;
2151
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2152
+ attr->ia_size != inode->i_size) {
2153
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2154
+ req->r_args.setattr.old_size =
2155
+ cpu_to_le64(inode->i_size);
2156
+ mask |= CEPH_SETATTR_SIZE;
2157
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2158
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2159
+ }
2160
+ }
20692161 if (ia_valid & ATTR_MTIME) {
20702162 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
20712163 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
....@@ -2085,25 +2177,6 @@
20852177 &attr->ia_mtime);
20862178 mask |= CEPH_SETATTR_MTIME;
20872179 release |= CEPH_CAP_FILE_SHARED |
2088
- CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2089
- }
2090
- }
2091
- if (ia_valid & ATTR_SIZE) {
2092
- dout("setattr %p size %lld -> %lld\n", inode,
2093
- inode->i_size, attr->ia_size);
2094
- if ((issued & CEPH_CAP_FILE_EXCL) &&
2095
- attr->ia_size > inode->i_size) {
2096
- i_size_write(inode, attr->ia_size);
2097
- inode->i_blocks = calc_inode_blocks(attr->ia_size);
2098
- ci->i_reported_size = attr->ia_size;
2099
- dirtied |= CEPH_CAP_FILE_EXCL;
2100
- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2101
- attr->ia_size != inode->i_size) {
2102
- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2103
- req->r_args.setattr.old_size =
2104
- cpu_to_le64(inode->i_size);
2105
- mask |= CEPH_SETATTR_SIZE;
2106
- release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
21072180 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
21082181 }
21092182 }
....@@ -2223,8 +2296,8 @@
22232296
22242297 dout("do_getattr inode %p mask %s mode 0%o\n",
22252298 inode, ceph_cap_string(mask), inode->i_mode);
2226
- if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
2227
- return 0;
2299
+ if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
2300
+ return 0;
22282301
22292302 mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
22302303 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
....@@ -2271,42 +2344,82 @@
22712344 return err;
22722345 }
22732346
2347
+/* Craft a mask of needed caps given a set of requested statx attrs. */
2348
+static int statx_to_caps(u32 want)
2349
+{
2350
+ int mask = 0;
2351
+
2352
+ if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
2353
+ mask |= CEPH_CAP_AUTH_SHARED;
2354
+
2355
+ if (want & (STATX_NLINK|STATX_CTIME))
2356
+ mask |= CEPH_CAP_LINK_SHARED;
2357
+
2358
+ if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
2359
+ STATX_BLOCKS))
2360
+ mask |= CEPH_CAP_FILE_SHARED;
2361
+
2362
+ if (want & (STATX_CTIME))
2363
+ mask |= CEPH_CAP_XATTR_SHARED;
2364
+
2365
+ return mask;
2366
+}
2367
+
22742368 /*
2275
- * Get all attributes. Hopefully somedata we'll have a statlite()
2276
- * and can limit the fields we require to be accurate.
2369
+ * Get all the attributes. If we have sufficient caps for the requested attrs,
2370
+ * then we can avoid talking to the MDS at all.
22772371 */
22782372 int ceph_getattr(const struct path *path, struct kstat *stat,
22792373 u32 request_mask, unsigned int flags)
22802374 {
22812375 struct inode *inode = d_inode(path->dentry);
22822376 struct ceph_inode_info *ci = ceph_inode(inode);
2283
- int err;
2377
+ u32 valid_mask = STATX_BASIC_STATS;
2378
+ int err = 0;
22842379
2285
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
2286
- if (!err) {
2287
- generic_fillattr(inode, stat);
2288
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
2289
- if (ceph_snap(inode) != CEPH_NOSNAP)
2290
- stat->dev = ceph_snap(inode);
2291
- else
2292
- stat->dev = 0;
2293
- if (S_ISDIR(inode->i_mode)) {
2294
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
2295
- RBYTES))
2296
- stat->size = ci->i_rbytes;
2297
- else
2298
- stat->size = ci->i_files + ci->i_subdirs;
2299
- stat->blocks = 0;
2300
- stat->blksize = 65536;
2301
- /*
2302
- * Some applications rely on the number of st_nlink
2303
- * value on directories to be either 0 (if unlinked)
2304
- * or 2 + number of subdirectories.
2305
- */
2306
- if (stat->nlink == 1)
2307
- /* '.' + '..' + subdirs */
2308
- stat->nlink = 1 + 1 + ci->i_subdirs;
2309
- }
2380
+ /* Skip the getattr altogether if we're asked not to sync */
2381
+ if (!(flags & AT_STATX_DONT_SYNC)) {
2382
+ err = ceph_do_getattr(inode, statx_to_caps(request_mask),
2383
+ flags & AT_STATX_FORCE_SYNC);
2384
+ if (err)
2385
+ return err;
23102386 }
2387
+
2388
+ generic_fillattr(inode, stat);
2389
+ stat->ino = ceph_present_inode(inode);
2390
+
2391
+ /*
2392
+ * btime on newly-allocated inodes is 0, so if this is still set to
2393
+ * that, then assume that it's not valid.
2394
+ */
2395
+ if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
2396
+ stat->btime = ci->i_btime;
2397
+ valid_mask |= STATX_BTIME;
2398
+ }
2399
+
2400
+ if (ceph_snap(inode) == CEPH_NOSNAP)
2401
+ stat->dev = inode->i_sb->s_dev;
2402
+ else
2403
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
2404
+
2405
+ if (S_ISDIR(inode->i_mode)) {
2406
+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
2407
+ RBYTES))
2408
+ stat->size = ci->i_rbytes;
2409
+ else
2410
+ stat->size = ci->i_files + ci->i_subdirs;
2411
+ stat->blocks = 0;
2412
+ stat->blksize = 65536;
2413
+ /*
2414
+ * Some applications rely on the number of st_nlink
2415
+ * value on directories to be either 0 (if unlinked)
2416
+ * or 2 + number of subdirectories.
2417
+ */
2418
+ if (stat->nlink == 1)
2419
+ /* '.' + '..' + subdirs */
2420
+ stat->nlink = 1 + 1 + ci->i_subdirs;
2421
+ }
2422
+
2423
+ stat->result_mask = request_mask & valid_mask;
23112424 return err;
23122425 }