hc
2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/fs/ceph/caps.c
....@@ -8,6 +8,7 @@
88 #include <linux/vmalloc.h>
99 #include <linux/wait.h>
1010 #include <linux/writeback.h>
11
+#include <linux/iversion.h>
1112
1213 #include "super.h"
1314 #include "mds_client.h"
....@@ -148,11 +149,17 @@
148149 spin_unlock(&mdsc->caps_list_lock);
149150 }
150151
151
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
152
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
153
+ struct ceph_mount_options *fsopt)
152154 {
153155 spin_lock(&mdsc->caps_list_lock);
154
- mdsc->caps_min_count += delta;
155
- BUG_ON(mdsc->caps_min_count < 0);
156
+ mdsc->caps_min_count = fsopt->max_readdir;
157
+ if (mdsc->caps_min_count < 1024)
158
+ mdsc->caps_min_count = 1024;
159
+ mdsc->caps_use_max = fsopt->caps_max;
160
+ if (mdsc->caps_use_max > 0 &&
161
+ mdsc->caps_use_max < mdsc->caps_min_count)
162
+ mdsc->caps_use_max = mdsc->caps_min_count;
156163 spin_unlock(&mdsc->caps_list_lock);
157164 }
158165
....@@ -272,6 +279,7 @@
272279 if (!err) {
273280 BUG_ON(have + alloc != need);
274281 ctx->count = need;
282
+ ctx->used = 0;
275283 }
276284
277285 spin_lock(&mdsc->caps_list_lock);
....@@ -295,13 +303,24 @@
295303 }
296304
297305 void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
298
- struct ceph_cap_reservation *ctx)
306
+ struct ceph_cap_reservation *ctx)
299307 {
308
+ bool reclaim = false;
309
+ if (!ctx->count)
310
+ return;
311
+
300312 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
301313 spin_lock(&mdsc->caps_list_lock);
302314 __ceph_unreserve_caps(mdsc, ctx->count);
303315 ctx->count = 0;
316
+
317
+ if (mdsc->caps_use_max > 0 &&
318
+ mdsc->caps_use_count > mdsc->caps_use_max)
319
+ reclaim = true;
304320 spin_unlock(&mdsc->caps_list_lock);
321
+
322
+ if (reclaim)
323
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
305324 }
306325
307326 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
....@@ -346,6 +365,7 @@
346365 BUG_ON(list_empty(&mdsc->caps_list));
347366
348367 ctx->count--;
368
+ ctx->used++;
349369 mdsc->caps_reserve_count--;
350370 mdsc->caps_use_count++;
351371
....@@ -438,37 +458,6 @@
438458 }
439459
440460 /*
441
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
442
- */
443
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
444
-{
445
- struct ceph_cap *cap;
446
- int mds = -1;
447
- struct rb_node *p;
448
-
449
- /* prefer mds with WR|BUFFER|EXCL caps */
450
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
451
- cap = rb_entry(p, struct ceph_cap, ci_node);
452
- mds = cap->mds;
453
- if (cap->issued & (CEPH_CAP_FILE_WR |
454
- CEPH_CAP_FILE_BUFFER |
455
- CEPH_CAP_FILE_EXCL))
456
- break;
457
- }
458
- return mds;
459
-}
460
-
461
-int ceph_get_cap_mds(struct inode *inode)
462
-{
463
- struct ceph_inode_info *ci = ceph_inode(inode);
464
- int mds;
465
- spin_lock(&ci->i_ceph_lock);
466
- mds = __ceph_get_cap_mds(ceph_inode(inode));
467
- spin_unlock(&ci->i_ceph_lock);
468
- return mds;
469
-}
470
-
471
-/*
472461 * Called under i_ceph_lock.
473462 */
474463 static void __insert_cap_node(struct ceph_inode_info *ci,
....@@ -500,14 +489,11 @@
500489 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
501490 struct ceph_inode_info *ci)
502491 {
503
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
504
-
505
- ci->i_hold_caps_min = round_jiffies(jiffies +
506
- ma->caps_wanted_delay_min * HZ);
492
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
507493 ci->i_hold_caps_max = round_jiffies(jiffies +
508
- ma->caps_wanted_delay_max * HZ);
509
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
510
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
494
+ opt->caps_wanted_delay_max * HZ);
495
+ dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
496
+ ci->i_hold_caps_max - jiffies);
511497 }
512498
513499 /*
....@@ -521,8 +507,7 @@
521507 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
522508 struct ceph_inode_info *ci)
523509 {
524
- __cap_set_timeouts(mdsc, ci);
525
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
510
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
526511 ci->i_ceph_flags, ci->i_hold_caps_max);
527512 if (!mdsc->stopping) {
528513 spin_lock(&mdsc->cap_delay_lock);
....@@ -531,6 +516,7 @@
531516 goto no_change;
532517 list_del_init(&ci->i_cap_delay_list);
533518 }
519
+ __cap_set_timeouts(mdsc, ci);
534520 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
535521 no_change:
536522 spin_unlock(&mdsc->cap_delay_lock);
....@@ -570,19 +556,20 @@
570556 spin_unlock(&mdsc->cap_delay_lock);
571557 }
572558
573
-/*
574
- * Common issue checks for add_cap, handle_cap_grant.
575
- */
559
+/* Common issue checks for add_cap, handle_cap_grant. */
576560 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
577561 unsigned issued)
578562 {
579563 unsigned had = __ceph_caps_issued(ci, NULL);
580564
565
+ lockdep_assert_held(&ci->i_ceph_lock);
566
+
581567 /*
582568 * Each time we receive FILE_CACHE anew, we increment
583569 * i_rdcache_gen.
584570 */
585
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
571
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
572
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
586573 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
587574 ci->i_rdcache_gen++;
588575 }
....@@ -601,12 +588,40 @@
601588 __ceph_dir_clear_complete(ci);
602589 }
603590 }
591
+
592
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
593
+ if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
594
+ !(issued & CEPH_CAP_DIR_CREATE)) {
595
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
596
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
597
+ }
598
+}
599
+
600
+/**
601
+ * change_auth_cap_ses - move inode to appropriate lists when auth caps change
602
+ * @ci: inode to be moved
603
+ * @session: new auth caps session
604
+ */
605
+static void change_auth_cap_ses(struct ceph_inode_info *ci,
606
+ struct ceph_mds_session *session)
607
+{
608
+ lockdep_assert_held(&ci->i_ceph_lock);
609
+
610
+ if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
611
+ return;
612
+
613
+ spin_lock(&session->s_mdsc->cap_dirty_lock);
614
+ if (!list_empty(&ci->i_dirty_item))
615
+ list_move(&ci->i_dirty_item, &session->s_cap_dirty);
616
+ if (!list_empty(&ci->i_flushing_item))
617
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
618
+ spin_unlock(&session->s_mdsc->cap_dirty_lock);
604619 }
605620
606621 /*
607622 * Add a capability under the given MDS session.
608623 *
609
- * Caller should hold session snap_rwsem (read) and s_mutex.
624
+ * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
610625 *
611626 * @fmode is the open file mode, if we are opening a file, otherwise
612627 * it is < 0. (This is so we can atomically add the cap and add an
....@@ -614,7 +629,7 @@
614629 */
615630 void ceph_add_cap(struct inode *inode,
616631 struct ceph_mds_session *session, u64 cap_id,
617
- int fmode, unsigned issued, unsigned wanted,
632
+ unsigned issued, unsigned wanted,
618633 unsigned seq, unsigned mseq, u64 realmino, int flags,
619634 struct ceph_cap **new_cap)
620635 {
....@@ -623,16 +638,16 @@
623638 struct ceph_cap *cap;
624639 int mds = session->s_mds;
625640 int actual_wanted;
641
+ u32 gen;
642
+
643
+ lockdep_assert_held(&ci->i_ceph_lock);
626644
627645 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
628646 session->s_mds, cap_id, ceph_cap_string(issued), seq);
629647
630
- /*
631
- * If we are opening the file, include file mode wanted bits
632
- * in wanted.
633
- */
634
- if (fmode >= 0)
635
- wanted |= ceph_caps_for_mode(fmode);
648
+ spin_lock(&session->s_gen_ttl_lock);
649
+ gen = session->s_cap_gen;
650
+ spin_unlock(&session->s_gen_ttl_lock);
636651
637652 cap = __get_cap_for_mds(ci, mds);
638653 if (!cap) {
....@@ -653,8 +668,16 @@
653668 spin_lock(&session->s_cap_lock);
654669 list_add_tail(&cap->session_caps, &session->s_caps);
655670 session->s_nr_caps++;
671
+ atomic64_inc(&mdsc->metric.total_caps);
656672 spin_unlock(&session->s_cap_lock);
657673 } else {
674
+ spin_lock(&session->s_cap_lock);
675
+ list_move_tail(&cap->session_caps, &session->s_caps);
676
+ spin_unlock(&session->s_cap_lock);
677
+
678
+ if (cap->cap_gen < gen)
679
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
680
+
658681 /*
659682 * auth mds of the inode changed. we received the cap export
660683 * message, but still haven't received the cap import message.
....@@ -726,6 +749,9 @@
726749 if (flags & CEPH_CAP_FLAG_AUTH) {
727750 if (!ci->i_auth_cap ||
728751 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
752
+ if (ci->i_auth_cap &&
753
+ ci->i_auth_cap->session != cap->session)
754
+ change_auth_cap_ses(ci, cap->session);
729755 ci->i_auth_cap = cap;
730756 cap->mds_wanted = wanted;
731757 }
....@@ -746,10 +772,7 @@
746772 cap->seq = seq;
747773 cap->issue_seq = seq;
748774 cap->mseq = mseq;
749
- cap->cap_gen = session->s_cap_gen;
750
-
751
- if (fmode >= 0)
752
- __ceph_get_fmode(ci, fmode);
775
+ cap->cap_gen = gen;
753776 }
754777
755778 /*
....@@ -864,8 +887,8 @@
864887 int have = ci->i_snap_caps;
865888
866889 if ((have & mask) == mask) {
867
- dout("__ceph_caps_issued_mask %p snap issued %s"
868
- " (mask %s)\n", &ci->vfs_inode,
890
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
891
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
869892 ceph_cap_string(have),
870893 ceph_cap_string(mask));
871894 return 1;
....@@ -876,8 +899,8 @@
876899 if (!__cap_is_valid(cap))
877900 continue;
878901 if ((cap->issued & mask) == mask) {
879
- dout("__ceph_caps_issued_mask %p cap %p issued %s"
880
- " (mask %s)\n", &ci->vfs_inode, cap,
902
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
903
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
881904 ceph_cap_string(cap->issued),
882905 ceph_cap_string(mask));
883906 if (touch)
....@@ -888,8 +911,8 @@
888911 /* does a combination of caps satisfy mask? */
889912 have |= cap->issued;
890913 if ((have & mask) == mask) {
891
- dout("__ceph_caps_issued_mask %p combo issued %s"
892
- " (mask %s)\n", &ci->vfs_inode,
914
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
915
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
893916 ceph_cap_string(cap->issued),
894917 ceph_cap_string(mask));
895918 if (touch) {
....@@ -903,7 +926,8 @@
903926 ci_node);
904927 if (!__cap_is_valid(cap))
905928 continue;
906
- __touch_cap(cap);
929
+ if (cap->issued & mask)
930
+ __touch_cap(cap);
907931 }
908932 }
909933 return 1;
....@@ -911,6 +935,20 @@
911935 }
912936
913937 return 0;
938
+}
939
+
940
+int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
941
+ int touch)
942
+{
943
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
944
+ int r;
945
+
946
+ r = __ceph_caps_issued_mask(ci, mask, touch);
947
+ if (r)
948
+ ceph_update_cap_hit(&fsc->mdsc->metric);
949
+ else
950
+ ceph_update_cap_mis(&fsc->mdsc->metric);
951
+ return r;
914952 }
915953
916954 /*
....@@ -952,29 +990,97 @@
952990 if (ci->i_rd_ref)
953991 used |= CEPH_CAP_FILE_RD;
954992 if (ci->i_rdcache_ref ||
955
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
993
+ (S_ISREG(ci->vfs_inode.i_mode) &&
956994 ci->vfs_inode.i_data.nrpages))
957995 used |= CEPH_CAP_FILE_CACHE;
958996 if (ci->i_wr_ref)
959997 used |= CEPH_CAP_FILE_WR;
960998 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
961999 used |= CEPH_CAP_FILE_BUFFER;
1000
+ if (ci->i_fx_ref)
1001
+ used |= CEPH_CAP_FILE_EXCL;
9621002 return used;
9631003 }
1004
+
1005
+#define FMODE_WAIT_BIAS 1000
9641006
9651007 /*
9661008 * wanted, by virtue of open file modes
9671009 */
9681010 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
9691011 {
970
- int i, bits = 0;
971
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
972
- if (ci->i_nr_by_mode[i])
973
- bits |= 1 << i;
1012
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
1013
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
1014
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
1015
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
1016
+ struct ceph_mount_options *opt =
1017
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
1018
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1019
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1020
+
1021
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
1022
+ int want = 0;
1023
+
1024
+ /* use used_cutoff here, to keep dir's wanted caps longer */
1025
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1026
+ time_after(ci->i_last_rd, used_cutoff))
1027
+ want |= CEPH_CAP_ANY_SHARED;
1028
+
1029
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1030
+ time_after(ci->i_last_wr, used_cutoff)) {
1031
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1032
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1033
+ want |= CEPH_CAP_ANY_DIR_OPS;
1034
+ }
1035
+
1036
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1037
+ want |= CEPH_CAP_PIN;
1038
+
1039
+ return want;
1040
+ } else {
1041
+ int bits = 0;
1042
+
1043
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1044
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1045
+ time_after(ci->i_last_rd, used_cutoff))
1046
+ bits |= 1 << RD_SHIFT;
1047
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
1048
+ bits |= 1 << RD_SHIFT;
1049
+ }
1050
+
1051
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1052
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1053
+ time_after(ci->i_last_wr, used_cutoff))
1054
+ bits |= 1 << WR_SHIFT;
1055
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
1056
+ bits |= 1 << WR_SHIFT;
1057
+ }
1058
+
1059
+ /* check lazyio only when read/write is wanted */
1060
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1061
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1062
+ bits |= 1 << LAZY_SHIFT;
1063
+
1064
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
9741065 }
975
- if (bits == 0)
976
- return 0;
977
- return ceph_caps_for_mode(bits >> 1);
1066
+}
1067
+
1068
+/*
1069
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1070
+ */
1071
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
1072
+{
1073
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
1074
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
1075
+ /* we want EXCL if holding caps of dir ops */
1076
+ if (w & CEPH_CAP_ANY_DIR_OPS)
1077
+ w |= CEPH_CAP_FILE_EXCL;
1078
+ } else {
1079
+ /* we want EXCL if dirty data */
1080
+ if (w & CEPH_CAP_FILE_BUFFER)
1081
+ w |= CEPH_CAP_FILE_EXCL;
1082
+ }
1083
+ return w;
9781084 }
9791085
9801086 /*
....@@ -998,26 +1104,13 @@
9981104 return mds_wanted;
9991105 }
10001106
1001
-/*
1002
- * called under i_ceph_lock
1003
- */
1004
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
1005
-{
1006
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
1007
-}
1008
-
1009
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
1010
-{
1011
- return !RB_EMPTY_ROOT(&ci->i_caps);
1012
-}
1013
-
10141107 int ceph_is_any_caps(struct inode *inode)
10151108 {
10161109 struct ceph_inode_info *ci = ceph_inode(inode);
10171110 int ret;
10181111
10191112 spin_lock(&ci->i_ceph_lock);
1020
- ret = __ceph_is_any_caps(ci);
1113
+ ret = __ceph_is_any_real_caps(ci);
10211114 spin_unlock(&ci->i_ceph_lock);
10221115
10231116 return ret;
....@@ -1062,8 +1155,10 @@
10621155
10631156 /* remove from inode's cap rbtree, and clear auth cap */
10641157 rb_erase(&cap->ci_node, &ci->i_caps);
1065
- if (ci->i_auth_cap == cap)
1158
+ if (ci->i_auth_cap == cap) {
1159
+ WARN_ON_ONCE(!list_empty(&ci->i_dirty_item));
10661160 ci->i_auth_cap = NULL;
1161
+ }
10671162
10681163 /* remove from session list */
10691164 spin_lock(&session->s_cap_lock);
....@@ -1074,6 +1169,7 @@
10741169 } else {
10751170 list_del_init(&cap->session_caps);
10761171 session->s_nr_caps--;
1172
+ atomic64_dec(&mdsc->metric.total_caps);
10771173 cap->session = NULL;
10781174 removed = 1;
10791175 }
....@@ -1088,9 +1184,7 @@
10881184 (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
10891185 cap->queue_release = 1;
10901186 if (removed) {
1091
- list_add_tail(&cap->session_caps,
1092
- &session->s_cap_releases);
1093
- session->s_num_cap_releases++;
1187
+ __ceph_queue_cap_release(session, cap);
10941188 removed = 0;
10951189 }
10961190 } else {
....@@ -1103,15 +1197,16 @@
11031197 if (removed)
11041198 ceph_put_cap(mdsc, cap);
11051199
1106
- /* when reconnect denied, we remove session caps forcibly,
1107
- * i_wr_ref can be non-zero. If there are ongoing write,
1108
- * keep i_snap_realm.
1109
- */
1110
- if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
1111
- drop_inode_snap_realm(ci);
1200
+ if (!__ceph_is_any_real_caps(ci)) {
1201
+ /* when reconnect denied, we remove session caps forcibly,
1202
+ * i_wr_ref can be non-zero. If there are ongoing write,
1203
+ * keep i_snap_realm.
1204
+ */
1205
+ if (ci->i_wr_ref == 0 && ci->i_snap_realm)
1206
+ drop_inode_snap_realm(ci);
11121207
1113
- if (!__ceph_is_any_real_caps(ci))
11141208 __cap_delay_cancel(mdsc, ci);
1209
+ }
11151210 }
11161211
11171212 struct cap_msg_args {
....@@ -1119,8 +1214,10 @@
11191214 u64 ino, cid, follows;
11201215 u64 flush_tid, oldest_flush_tid, size, max_size;
11211216 u64 xattr_version;
1217
+ u64 change_attr;
11221218 struct ceph_buffer *xattr_buf;
1123
- struct timespec64 atime, mtime, ctime;
1219
+ struct ceph_buffer *old_xattr_buf;
1220
+ struct timespec64 atime, mtime, ctime, btime;
11241221 int op, caps, wanted, dirty;
11251222 u32 seq, issue_seq, mseq, time_warp_seq;
11261223 u32 flags;
....@@ -1128,39 +1225,30 @@
11281225 kgid_t gid;
11291226 umode_t mode;
11301227 bool inline_data;
1228
+ bool wake;
11311229 };
11321230
11331231 /*
1134
- * Build and send a cap message to the given MDS.
1135
- *
1136
- * Caller should be holding s_mutex.
1232
+ * cap struct size + flock buffer size + inline version + inline data size +
1233
+ * osd_epoch_barrier + oldest_flush_tid
11371234 */
1138
-static int send_cap_msg(struct cap_msg_args *arg)
1235
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
1236
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
1237
+
1238
+/* Marshal up the cap msg to the MDS */
1239
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
11391240 {
11401241 struct ceph_mds_caps *fc;
1141
- struct ceph_msg *msg;
11421242 void *p;
1143
- size_t extra_len;
1144
- struct timespec64 zerotime = {0};
11451243 struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
11461244
1147
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
1148
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
1149
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
1150
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
1151
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
1152
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
1153
- arg->mseq, arg->follows, arg->size, arg->max_size,
1154
- arg->xattr_version,
1245
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
1246
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1247
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1248
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1249
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1250
+ arg->size, arg->max_size, arg->xattr_version,
11551251 arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
1156
-
1157
- /* flock buffer size + inline version + inline data size +
1158
- * osd_epoch_barrier + oldest_flush_tid */
1159
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
1160
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
1161
- GFP_NOFS, false);
1162
- if (!msg)
1163
- return -ENOMEM;
11641252
11651253 msg->hdr.version = cpu_to_le16(10);
11661254 msg->hdr.tid = cpu_to_le64(arg->flush_tid);
....@@ -1226,29 +1314,20 @@
12261314 /* pool namespace (version 8) (mds always ignores this) */
12271315 ceph_encode_32(&p, 0);
12281316
1229
- /*
1230
- * btime and change_attr (version 9)
1231
- *
1232
- * We just zero these out for now, as the MDS ignores them unless
1233
- * the requisite feature flags are set (which we don't do yet).
1234
- */
1235
- ceph_encode_timespec64(p, &zerotime);
1317
+ /* btime and change_attr (version 9) */
1318
+ ceph_encode_timespec64(p, &arg->btime);
12361319 p += sizeof(struct ceph_timespec);
1237
- ceph_encode_64(&p, 0);
1320
+ ceph_encode_64(&p, arg->change_attr);
12381321
12391322 /* Advisory flags (version 10) */
12401323 ceph_encode_32(&p, arg->flags);
1241
-
1242
- ceph_con_send(&arg->session->s_con, msg);
1243
- return 0;
12441324 }
12451325
12461326 /*
12471327 * Queue cap releases when an inode is dropped from our cache.
12481328 */
1249
-void ceph_queue_caps_release(struct inode *inode)
1329
+void __ceph_remove_caps(struct ceph_inode_info *ci)
12501330 {
1251
- struct ceph_inode_info *ci = ceph_inode(inode);
12521331 struct rb_node *p;
12531332
12541333 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
....@@ -1264,141 +1343,133 @@
12641343 }
12651344
12661345 /*
1267
- * Send a cap msg on the given inode. Update our caps state, then
1268
- * drop i_ceph_lock and send the message.
1346
+ * Prepare to send a cap message to an MDS. Update the cap state, and populate
1347
+ * the arg struct with the parameters that will need to be sent. This should
1348
+ * be done under the i_ceph_lock to guard against changes to cap state.
12691349 *
12701350 * Make note of max_size reported/requested from mds, revoked caps
12711351 * that have now been implemented.
1272
- *
1273
- * Make half-hearted attempt ot to invalidate page cache if we are
1274
- * dropping RDCACHE. Note that this will leave behind locked pages
1275
- * that we'll then need to deal with elsewhere.
1276
- *
1277
- * Return non-zero if delayed release, or we experienced an error
1278
- * such that the caller should requeue + retry later.
1279
- *
1280
- * called with i_ceph_lock, then drops it.
1281
- * caller should hold snap_rwsem (read), s_mutex.
12821352 */
1283
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1284
- int op, bool sync, int used, int want, int retain,
1285
- int flushing, u64 flush_tid, u64 oldest_flush_tid)
1286
- __releases(cap->ci->i_ceph_lock)
1353
+static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
1354
+ int op, int flags, int used, int want, int retain,
1355
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
12871356 {
12881357 struct ceph_inode_info *ci = cap->ci;
12891358 struct inode *inode = &ci->vfs_inode;
1290
- struct ceph_buffer *old_blob = NULL;
1291
- struct cap_msg_args arg;
12921359 int held, revoking;
1293
- int wake = 0;
1294
- int delayed = 0;
1295
- int ret;
1360
+
1361
+ lockdep_assert_held(&ci->i_ceph_lock);
12961362
12971363 held = cap->issued | cap->implemented;
12981364 revoking = cap->implemented & ~cap->issued;
12991365 retain &= ~revoking;
13001366
1301
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1302
- inode, cap, cap->session,
1367
+ dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
1368
+ __func__, inode, cap, cap->session,
13031369 ceph_cap_string(held), ceph_cap_string(held & retain),
13041370 ceph_cap_string(revoking));
13051371 BUG_ON((retain & CEPH_CAP_PIN) == 0);
13061372
1307
- arg.session = cap->session;
1308
-
1309
- /* don't release wanted unless we've waited a bit. */
1310
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1311
- time_before(jiffies, ci->i_hold_caps_min)) {
1312
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1313
- ceph_cap_string(cap->issued),
1314
- ceph_cap_string(cap->issued & retain),
1315
- ceph_cap_string(cap->mds_wanted),
1316
- ceph_cap_string(want));
1317
- want |= cap->mds_wanted;
1318
- retain |= cap->issued;
1319
- delayed = 1;
1320
- }
1321
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1322
- if (want & ~cap->mds_wanted) {
1323
- /* user space may open/close single file frequently.
1324
- * This avoids droping mds_wanted immediately after
1325
- * requesting new mds_wanted.
1326
- */
1327
- __cap_set_timeouts(mdsc, ci);
1328
- }
1373
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
13291374
13301375 cap->issued &= retain; /* drop bits we don't want */
1331
- if (cap->implemented & ~cap->issued) {
1332
- /*
1333
- * Wake up any waiters on wanted -> needed transition.
1334
- * This is due to the weird transition from buffered
1335
- * to sync IO... we need to flush dirty pages _before_
1336
- * allowing sync writes to avoid reordering.
1337
- */
1338
- wake = 1;
1339
- }
1376
+ /*
1377
+ * Wake up any waiters on wanted -> needed transition. This is due to
1378
+ * the weird transition from buffered to sync IO... we need to flush
1379
+ * dirty pages _before_ allowing sync writes to avoid reordering.
1380
+ */
1381
+ arg->wake = cap->implemented & ~cap->issued;
13401382 cap->implemented &= cap->issued | used;
13411383 cap->mds_wanted = want;
13421384
1343
- arg.ino = ceph_vino(inode).ino;
1344
- arg.cid = cap->cap_id;
1345
- arg.follows = flushing ? ci->i_head_snapc->seq : 0;
1346
- arg.flush_tid = flush_tid;
1347
- arg.oldest_flush_tid = oldest_flush_tid;
1385
+ arg->session = cap->session;
1386
+ arg->ino = ceph_vino(inode).ino;
1387
+ arg->cid = cap->cap_id;
1388
+ arg->follows = flushing ? ci->i_head_snapc->seq : 0;
1389
+ arg->flush_tid = flush_tid;
1390
+ arg->oldest_flush_tid = oldest_flush_tid;
13481391
1349
- arg.size = inode->i_size;
1350
- ci->i_reported_size = arg.size;
1351
- arg.max_size = ci->i_wanted_max_size;
1352
- ci->i_requested_max_size = arg.max_size;
1392
+ arg->size = inode->i_size;
1393
+ ci->i_reported_size = arg->size;
1394
+ arg->max_size = ci->i_wanted_max_size;
1395
+ if (cap == ci->i_auth_cap) {
1396
+ if (want & CEPH_CAP_ANY_FILE_WR)
1397
+ ci->i_requested_max_size = arg->max_size;
1398
+ else
1399
+ ci->i_requested_max_size = 0;
1400
+ }
13531401
13541402 if (flushing & CEPH_CAP_XATTR_EXCL) {
1355
- old_blob = __ceph_build_xattrs_blob(ci);
1356
- arg.xattr_version = ci->i_xattrs.version;
1357
- arg.xattr_buf = ci->i_xattrs.blob;
1403
+ arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1404
+ arg->xattr_version = ci->i_xattrs.version;
1405
+ arg->xattr_buf = ci->i_xattrs.blob;
13581406 } else {
1359
- arg.xattr_buf = NULL;
1407
+ arg->xattr_buf = NULL;
1408
+ arg->old_xattr_buf = NULL;
13601409 }
13611410
1362
- arg.mtime = inode->i_mtime;
1363
- arg.atime = inode->i_atime;
1364
- arg.ctime = inode->i_ctime;
1411
+ arg->mtime = inode->i_mtime;
1412
+ arg->atime = inode->i_atime;
1413
+ arg->ctime = inode->i_ctime;
1414
+ arg->btime = ci->i_btime;
1415
+ arg->change_attr = inode_peek_iversion_raw(inode);
13651416
1366
- arg.op = op;
1367
- arg.caps = cap->implemented;
1368
- arg.wanted = want;
1369
- arg.dirty = flushing;
1417
+ arg->op = op;
1418
+ arg->caps = cap->implemented;
1419
+ arg->wanted = want;
1420
+ arg->dirty = flushing;
13701421
1371
- arg.seq = cap->seq;
1372
- arg.issue_seq = cap->issue_seq;
1373
- arg.mseq = cap->mseq;
1374
- arg.time_warp_seq = ci->i_time_warp_seq;
1422
+ arg->seq = cap->seq;
1423
+ arg->issue_seq = cap->issue_seq;
1424
+ arg->mseq = cap->mseq;
1425
+ arg->time_warp_seq = ci->i_time_warp_seq;
13751426
1376
- arg.uid = inode->i_uid;
1377
- arg.gid = inode->i_gid;
1378
- arg.mode = inode->i_mode;
1427
+ arg->uid = inode->i_uid;
1428
+ arg->gid = inode->i_gid;
1429
+ arg->mode = inode->i_mode;
13791430
1380
- arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1381
- if (list_empty(&ci->i_cap_snaps))
1382
- arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
1383
- else
1384
- arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1385
- if (sync)
1386
- arg.flags |= CEPH_CLIENT_CAPS_SYNC;
1431
+ arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1432
+ if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1433
+ !list_empty(&ci->i_cap_snaps)) {
1434
+ struct ceph_cap_snap *capsnap;
1435
+ list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1436
+ if (capsnap->cap_flush.tid)
1437
+ break;
1438
+ if (capsnap->need_flush) {
1439
+ flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1440
+ break;
1441
+ }
1442
+ }
1443
+ }
1444
+ arg->flags = flags;
1445
+}
13871446
1388
- spin_unlock(&ci->i_ceph_lock);
1447
+/*
1448
+ * Send a cap msg on the given inode.
1449
+ *
1450
+ * Caller should hold snap_rwsem (read), s_mutex.
1451
+ */
1452
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
1453
+{
1454
+ struct ceph_msg *msg;
1455
+ struct inode *inode = &ci->vfs_inode;
13891456
1390
- ceph_buffer_put(old_blob);
1391
-
1392
- ret = send_cap_msg(&arg);
1393
- if (ret < 0) {
1394
- dout("error sending cap msg, must requeue %p\n", inode);
1395
- delayed = 1;
1457
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1458
+ if (!msg) {
1459
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
1460
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
1461
+ arg->flush_tid);
1462
+ spin_lock(&ci->i_ceph_lock);
1463
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
1464
+ spin_unlock(&ci->i_ceph_lock);
1465
+ return;
13961466 }
13971467
1398
- if (wake)
1468
+ encode_cap_msg(msg, arg);
1469
+ ceph_con_send(&arg->session->s_con, msg);
1470
+ ceph_buffer_put(arg->old_xattr_buf);
1471
+ if (arg->wake)
13991472 wake_up_all(&ci->i_cap_wq);
1400
-
1401
- return delayed;
14021473 }
14031474
14041475 static inline int __send_flush_snap(struct inode *inode,
....@@ -1407,6 +1478,11 @@
14071478 u32 mseq, u64 oldest_flush_tid)
14081479 {
14091480 struct cap_msg_args arg;
1481
+ struct ceph_msg *msg;
1482
+
1483
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1484
+ if (!msg)
1485
+ return -ENOMEM;
14101486
14111487 arg.session = session;
14121488 arg.ino = ceph_vino(inode).ino;
....@@ -1419,10 +1495,13 @@
14191495 arg.max_size = 0;
14201496 arg.xattr_version = capsnap->xattr_version;
14211497 arg.xattr_buf = capsnap->xattr_blob;
1498
+ arg.old_xattr_buf = NULL;
14221499
14231500 arg.atime = capsnap->atime;
14241501 arg.mtime = capsnap->mtime;
14251502 arg.ctime = capsnap->ctime;
1503
+ arg.btime = capsnap->btime;
1504
+ arg.change_attr = capsnap->change_attr;
14261505
14271506 arg.op = CEPH_CAP_OP_FLUSHSNAP;
14281507 arg.caps = capsnap->issued;
....@@ -1440,8 +1519,11 @@
14401519
14411520 arg.inline_data = capsnap->inline_data;
14421521 arg.flags = 0;
1522
+ arg.wake = false;
14431523
1444
- return send_cap_msg(&arg);
1524
+ encode_cap_msg(msg, &arg);
1525
+ ceph_con_send(&arg.session->s_con, msg);
1526
+ return 0;
14451527 }
14461528
14471529 /*
....@@ -1554,6 +1636,7 @@
15541636 struct inode *inode = &ci->vfs_inode;
15551637 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
15561638 struct ceph_mds_session *session = NULL;
1639
+ bool need_put = false;
15571640 int mds;
15581641
15591642 dout("ceph_flush_snaps %p\n", inode);
....@@ -1590,10 +1673,8 @@
15901673 }
15911674
15921675 // make sure flushsnap messages are sent in proper order.
1593
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
1676
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
15941677 __kick_flushing_caps(mdsc, session, ci, 0);
1595
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
1596
- }
15971678
15981679 __ceph_flush_snaps(ci, session);
15991680 out:
....@@ -1607,8 +1688,13 @@
16071688 }
16081689 /* we flushed them all; remove this inode from the queue */
16091690 spin_lock(&mdsc->snap_flush_lock);
1691
+ if (!list_empty(&ci->i_snap_flush_item))
1692
+ need_put = true;
16101693 list_del_init(&ci->i_snap_flush_item);
16111694 spin_unlock(&mdsc->snap_flush_lock);
1695
+
1696
+ if (need_put)
1697
+ iput(inode);
16121698 }
16131699
16141700 /*
....@@ -1625,6 +1711,8 @@
16251711 int was = ci->i_dirty_caps;
16261712 int dirty = 0;
16271713
1714
+ lockdep_assert_held(&ci->i_ceph_lock);
1715
+
16281716 if (!ci->i_auth_cap) {
16291717 pr_warn("__mark_dirty_caps %p %llx mask %s, "
16301718 "but no auth cap (session was closed?)\n",
....@@ -1637,6 +1725,8 @@
16371725 ceph_cap_string(was | mask));
16381726 ci->i_dirty_caps |= mask;
16391727 if (was == 0) {
1728
+ struct ceph_mds_session *session = ci->i_auth_cap->session;
1729
+
16401730 WARN_ON_ONCE(ci->i_prealloc_cap_flush);
16411731 swap(ci->i_prealloc_cap_flush, *pcf);
16421732
....@@ -1649,7 +1739,7 @@
16491739 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
16501740 BUG_ON(!list_empty(&ci->i_dirty_item));
16511741 spin_lock(&mdsc->cap_dirty_lock);
1652
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1742
+ list_add(&ci->i_dirty_item, &session->s_cap_dirty);
16531743 spin_unlock(&mdsc->cap_dirty_lock);
16541744 if (ci->i_flushing_caps == 0) {
16551745 ihold(inode);
....@@ -1668,7 +1758,14 @@
16681758
16691759 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
16701760 {
1671
- return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1761
+ struct ceph_cap_flush *cf;
1762
+
1763
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1764
+ if (!cf)
1765
+ return NULL;
1766
+
1767
+ cf->is_capsnap = false;
1768
+ return cf;
16721769 }
16731770
16741771 void ceph_free_cap_flush(struct ceph_cap_flush *cf)
....@@ -1692,30 +1789,33 @@
16921789 * Remove cap_flush from the mdsc's or inode's flushing cap list.
16931790 * Return true if caller needs to wake up flush waiters.
16941791 */
1695
-static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
1696
- struct ceph_inode_info *ci,
1697
- struct ceph_cap_flush *cf)
1792
+static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1793
+ struct ceph_cap_flush *cf)
16981794 {
16991795 struct ceph_cap_flush *prev;
17001796 bool wake = cf->wake;
1701
- if (mdsc) {
1702
- /* are there older pending cap flushes? */
1703
- if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1704
- prev = list_prev_entry(cf, g_list);
1705
- prev->wake = true;
1706
- wake = false;
1707
- }
1708
- list_del(&cf->g_list);
1709
- } else if (ci) {
1710
- if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1711
- prev = list_prev_entry(cf, i_list);
1712
- prev->wake = true;
1713
- wake = false;
1714
- }
1715
- list_del(&cf->i_list);
1716
- } else {
1717
- BUG_ON(1);
1797
+
1798
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1799
+ prev = list_prev_entry(cf, g_list);
1800
+ prev->wake = true;
1801
+ wake = false;
17181802 }
1803
+ list_del_init(&cf->g_list);
1804
+ return wake;
1805
+}
1806
+
1807
+static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1808
+ struct ceph_cap_flush *cf)
1809
+{
1810
+ struct ceph_cap_flush *prev;
1811
+ bool wake = cf->wake;
1812
+
1813
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1814
+ prev = list_prev_entry(cf, i_list);
1815
+ prev->wake = true;
1816
+ wake = false;
1817
+ }
1818
+ list_del_init(&cf->i_list);
17191819 return wake;
17201820 }
17211821
....@@ -1723,17 +1823,18 @@
17231823 * Add dirty inode to the flushing list. Assigned a seq number so we
17241824 * can wait for caps to flush without starving.
17251825 *
1726
- * Called under i_ceph_lock.
1826
+ * Called under i_ceph_lock. Returns the flush tid.
17271827 */
1728
-static int __mark_caps_flushing(struct inode *inode,
1828
+static u64 __mark_caps_flushing(struct inode *inode,
17291829 struct ceph_mds_session *session, bool wake,
1730
- u64 *flush_tid, u64 *oldest_flush_tid)
1830
+ u64 *oldest_flush_tid)
17311831 {
17321832 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
17331833 struct ceph_inode_info *ci = ceph_inode(inode);
17341834 struct ceph_cap_flush *cf = NULL;
17351835 int flushing;
17361836
1837
+ lockdep_assert_held(&ci->i_ceph_lock);
17371838 BUG_ON(ci->i_dirty_caps == 0);
17381839 BUG_ON(list_empty(&ci->i_dirty_item));
17391840 BUG_ON(!ci->i_prealloc_cap_flush);
....@@ -1766,8 +1867,7 @@
17661867
17671868 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
17681869
1769
- *flush_tid = cf->tid;
1770
- return flushing;
1870
+ return cf->tid;
17711871 }
17721872
17731873 /*
....@@ -1817,8 +1917,6 @@
18171917 * versus held caps. Release, flush, ack revoked caps to mds as
18181918 * appropriate.
18191919 *
1820
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1821
- * cap release further.
18221920 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
18231921 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
18241922 * further delay.
....@@ -1826,9 +1924,8 @@
18261924 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
18271925 struct ceph_mds_session *session)
18281926 {
1829
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1830
- struct ceph_mds_client *mdsc = fsc->mdsc;
18311927 struct inode *inode = &ci->vfs_inode;
1928
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
18321929 struct ceph_cap *cap;
18331930 u64 flush_tid, oldest_flush_tid;
18341931 int file_wanted, used, cap_used;
....@@ -1837,48 +1934,53 @@
18371934 int mds = -1; /* keep track of how far we've gone through i_caps list
18381935 to avoid an infinite loop on retry */
18391936 struct rb_node *p;
1840
- int delayed = 0, sent = 0;
1841
- bool no_delay = flags & CHECK_CAPS_NODELAY;
18421937 bool queue_invalidate = false;
18431938 bool tried_invalidate = false;
18441939
1845
- /* if we are unmounting, flush any unused caps immediately. */
1846
- if (mdsc->stopping)
1847
- no_delay = true;
1848
-
18491940 spin_lock(&ci->i_ceph_lock);
1850
-
18511941 if (ci->i_ceph_flags & CEPH_I_FLUSH)
18521942 flags |= CHECK_CAPS_FLUSH;
1853
-
1854
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
1855
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
1856
- __cap_delay_cancel(mdsc, ci);
18571943
18581944 goto retry_locked;
18591945 retry:
18601946 spin_lock(&ci->i_ceph_lock);
18611947 retry_locked:
1948
+ /* Caps wanted by virtue of active open files. */
18621949 file_wanted = __ceph_caps_file_wanted(ci);
1950
+
1951
+ /* Caps which have active references against them */
18631952 used = __ceph_caps_used(ci);
1953
+
1954
+ /*
1955
+ * "issued" represents the current caps that the MDS wants us to have.
1956
+ * "implemented" is the set that we have been granted, and includes the
1957
+ * ones that have not yet been returned to the MDS (the "revoking" set,
1958
+ * usually because they have outstanding references).
1959
+ */
18641960 issued = __ceph_caps_issued(ci, &implemented);
18651961 revoking = implemented & ~issued;
18661962
18671963 want = file_wanted;
1964
+
1965
+ /* The ones we currently want to retain (may be adjusted below) */
18681966 retain = file_wanted | used | CEPH_CAP_PIN;
18691967 if (!mdsc->stopping && inode->i_nlink > 0) {
18701968 if (file_wanted) {
18711969 retain |= CEPH_CAP_ANY; /* be greedy */
18721970 } else if (S_ISDIR(inode->i_mode) &&
18731971 (issued & CEPH_CAP_FILE_SHARED) &&
1874
- __ceph_dir_is_complete(ci)) {
1972
+ __ceph_dir_is_complete(ci)) {
18751973 /*
18761974 * If a directory is complete, we want to keep
18771975 * the exclusive cap. So that MDS does not end up
18781976 * revoking the shared cap on every create/unlink
18791977 * operation.
18801978 */
1881
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1979
+ if (IS_RDONLY(inode)) {
1980
+ want = CEPH_CAP_ANY_SHARED;
1981
+ } else {
1982
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1983
+ }
18821984 retain |= want;
18831985 } else {
18841986
....@@ -1894,14 +1996,13 @@
18941996 }
18951997
18961998 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1897
- " issued %s revoking %s retain %s %s%s%s\n", inode,
1999
+ " issued %s revoking %s retain %s %s%s\n", inode,
18982000 ceph_cap_string(file_wanted),
18992001 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
19002002 ceph_cap_string(ci->i_flushing_caps),
19012003 ceph_cap_string(issued), ceph_cap_string(revoking),
19022004 ceph_cap_string(retain),
19032005 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1904
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
19052006 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
19062007
19072008 /*
....@@ -1909,8 +2010,8 @@
19092010 * have cached pages, but don't want them, then try to invalidate.
19102011 * If we fail, it's because pages are locked.... try again later.
19112012 */
1912
- if ((!no_delay || mdsc->stopping) &&
1913
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
2013
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
2014
+ S_ISREG(inode->i_mode) &&
19142015 !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
19152016 inode->i_data.nrpages && /* have cached pages */
19162017 (revoking & (CEPH_CAP_FILE_CACHE|
....@@ -1927,6 +2028,9 @@
19272028 }
19282029
19292030 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2031
+ int mflags = 0;
2032
+ struct cap_msg_args arg;
2033
+
19302034 cap = rb_entry(p, struct ceph_cap, ci_node);
19312035
19322036 /* avoid looping forever */
....@@ -1936,6 +2040,10 @@
19362040
19372041 /* NOTE: no side-effects allowed, until we take s_mutex */
19382042
2043
+ /*
2044
+ * If we have an auth cap, we don't need to consider any
2045
+ * overlapping caps as used.
2046
+ */
19392047 cap_used = used;
19402048 if (ci->i_auth_cap && cap != ci->i_auth_cap)
19412049 cap_used &= ~ci->i_auth_cap->issued;
....@@ -1990,31 +2098,10 @@
19902098 }
19912099
19922100 /* things we might delay */
1993
- if ((cap->issued & ~retain) == 0 &&
1994
- cap->mds_wanted == want)
2101
+ if ((cap->issued & ~retain) == 0)
19952102 continue; /* nope, all good */
19962103
1997
- if (no_delay)
1998
- goto ack;
1999
-
2000
- /* delay? */
2001
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
2002
- time_before(jiffies, ci->i_hold_caps_max)) {
2003
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
2004
- ceph_cap_string(cap->issued),
2005
- ceph_cap_string(cap->issued & retain),
2006
- ceph_cap_string(cap->mds_wanted),
2007
- ceph_cap_string(want));
2008
- delayed++;
2009
- continue;
2010
- }
2011
-
20122104 ack:
2013
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2014
- dout(" skipping %p I_NOFLUSH set\n", inode);
2015
- continue;
2016
- }
2017
-
20182105 if (session && session != cap->session) {
20192106 dout("oops, wrong session %p mutex\n", session);
20202107 mutex_unlock(&session->s_mutex);
....@@ -2052,10 +2139,8 @@
20522139 if (cap == ci->i_auth_cap &&
20532140 (ci->i_ceph_flags &
20542141 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
2055
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2142
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
20562143 __kick_flushing_caps(mdsc, session, ci, 0);
2057
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2058
- }
20592144 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
20602145 __ceph_flush_snaps(ci, session);
20612146
....@@ -2076,9 +2161,12 @@
20762161 }
20772162
20782163 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2079
- flushing = __mark_caps_flushing(inode, session, false,
2080
- &flush_tid,
2081
- &oldest_flush_tid);
2164
+ flushing = ci->i_dirty_caps;
2165
+ flush_tid = __mark_caps_flushing(inode, session, false,
2166
+ &oldest_flush_tid);
2167
+ if (flags & CHECK_CAPS_FLUSH &&
2168
+ list_empty(&session->s_cap_dirty))
2169
+ mflags |= CEPH_CLIENT_CAPS_SYNC;
20822170 } else {
20832171 flushing = 0;
20842172 flush_tid = 0;
....@@ -2088,18 +2176,23 @@
20882176 }
20892177
20902178 mds = cap->mds; /* remember mds, so we don't repeat */
2091
- sent++;
20922179
2093
- /* __send_cap drops i_ceph_lock */
2094
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
2095
- cap_used, want, retain, flushing,
2096
- flush_tid, oldest_flush_tid);
2180
+ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
2181
+ want, retain, flushing, flush_tid, oldest_flush_tid);
2182
+ spin_unlock(&ci->i_ceph_lock);
2183
+
2184
+ __send_cap(&arg, ci);
2185
+
20972186 goto retry; /* retake i_ceph_lock and restart our cap scan. */
20982187 }
20992188
2100
- /* Reschedule delayed caps release if we delayed anything */
2101
- if (delayed)
2189
+ /* periodically re-calculate caps wanted by open files */
2190
+ if (__ceph_is_any_real_caps(ci) &&
2191
+ list_empty(&ci->i_cap_delay_list) &&
2192
+ (file_wanted & ~CEPH_CAP_PIN) &&
2193
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
21022194 __cap_delay_requeue(mdsc, ci);
2195
+ }
21032196
21042197 spin_unlock(&ci->i_ceph_lock);
21052198
....@@ -2125,18 +2218,12 @@
21252218
21262219 retry:
21272220 spin_lock(&ci->i_ceph_lock);
2128
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2129
- spin_unlock(&ci->i_ceph_lock);
2130
- dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
2131
- goto out;
2132
- }
2221
+retry_locked:
21332222 if (ci->i_dirty_caps && ci->i_auth_cap) {
21342223 struct ceph_cap *cap = ci->i_auth_cap;
2135
- int used = __ceph_caps_used(ci);
2136
- int want = __ceph_caps_wanted(ci);
2137
- int delayed;
2224
+ struct cap_msg_args arg;
21382225
2139
- if (!session || session != cap->session) {
2226
+ if (session != cap->session) {
21402227 spin_unlock(&ci->i_ceph_lock);
21412228 if (session)
21422229 mutex_unlock(&session->s_mutex);
....@@ -2149,19 +2236,26 @@
21492236 goto out;
21502237 }
21512238
2152
- flushing = __mark_caps_flushing(inode, session, true,
2153
- &flush_tid, &oldest_flush_tid);
2154
-
2155
- /* __send_cap drops i_ceph_lock */
2156
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
2157
- used, want, (cap->issued | cap->implemented),
2158
- flushing, flush_tid, oldest_flush_tid);
2159
-
2160
- if (delayed) {
2161
- spin_lock(&ci->i_ceph_lock);
2162
- __cap_delay_requeue(mdsc, ci);
2163
- spin_unlock(&ci->i_ceph_lock);
2239
+ if (ci->i_ceph_flags &
2240
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2241
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2242
+ __kick_flushing_caps(mdsc, session, ci, 0);
2243
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2244
+ __ceph_flush_snaps(ci, session);
2245
+ goto retry_locked;
21642246 }
2247
+
2248
+ flushing = ci->i_dirty_caps;
2249
+ flush_tid = __mark_caps_flushing(inode, session, true,
2250
+ &oldest_flush_tid);
2251
+
2252
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2253
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2254
+ (cap->issued | cap->implemented),
2255
+ flushing, flush_tid, oldest_flush_tid);
2256
+ spin_unlock(&ci->i_ceph_lock);
2257
+
2258
+ __send_cap(&arg, ci);
21652259 } else {
21662260 if (!list_empty(&ci->i_cap_flush_list)) {
21672261 struct ceph_cap_flush *cf =
....@@ -2206,6 +2300,7 @@
22062300 */
22072301 static int unsafe_request_wait(struct inode *inode)
22082302 {
2303
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
22092304 struct ceph_inode_info *ci = ceph_inode(inode);
22102305 struct ceph_mds_request *req1 = NULL, *req2 = NULL;
22112306 int ret, err = 0;
....@@ -2225,6 +2320,76 @@
22252320 }
22262321 spin_unlock(&ci->i_unsafe_lock);
22272322
2323
+ /*
2324
+ * Trigger to flush the journal logs in all the relevant MDSes
2325
+ * manually, or in the worst case we must wait at most 5 seconds
2326
+ * for the journal logs to be flushed by the MDSes periodically.
2327
+ */
2328
+ if (req1 || req2) {
2329
+ struct ceph_mds_request *req;
2330
+ struct ceph_mds_session **sessions;
2331
+ struct ceph_mds_session *s;
2332
+ unsigned int max_sessions;
2333
+ int i;
2334
+
2335
+ mutex_lock(&mdsc->mutex);
2336
+ max_sessions = mdsc->max_sessions;
2337
+
2338
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
2339
+ if (!sessions) {
2340
+ mutex_unlock(&mdsc->mutex);
2341
+ err = -ENOMEM;
2342
+ goto out;
2343
+ }
2344
+
2345
+ spin_lock(&ci->i_unsafe_lock);
2346
+ if (req1) {
2347
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
2348
+ r_unsafe_dir_item) {
2349
+ s = req->r_session;
2350
+ if (!s)
2351
+ continue;
2352
+ if (!sessions[s->s_mds]) {
2353
+ s = ceph_get_mds_session(s);
2354
+ sessions[s->s_mds] = s;
2355
+ }
2356
+ }
2357
+ }
2358
+ if (req2) {
2359
+ list_for_each_entry(req, &ci->i_unsafe_iops,
2360
+ r_unsafe_target_item) {
2361
+ s = req->r_session;
2362
+ if (!s)
2363
+ continue;
2364
+ if (!sessions[s->s_mds]) {
2365
+ s = ceph_get_mds_session(s);
2366
+ sessions[s->s_mds] = s;
2367
+ }
2368
+ }
2369
+ }
2370
+ spin_unlock(&ci->i_unsafe_lock);
2371
+
2372
+ /* the auth MDS */
2373
+ spin_lock(&ci->i_ceph_lock);
2374
+ if (ci->i_auth_cap) {
2375
+ s = ci->i_auth_cap->session;
2376
+ if (!sessions[s->s_mds])
2377
+ sessions[s->s_mds] = ceph_get_mds_session(s);
2378
+ }
2379
+ spin_unlock(&ci->i_ceph_lock);
2380
+ mutex_unlock(&mdsc->mutex);
2381
+
2382
+ /* send flush mdlog request to MDSes */
2383
+ for (i = 0; i < max_sessions; i++) {
2384
+ s = sessions[i];
2385
+ if (s) {
2386
+ send_flush_mdlog(s);
2387
+ ceph_put_mds_session(s);
2388
+ }
2389
+ }
2390
+ kfree(sessions);
2391
+ }
2392
+
22282393 dout("unsafe_request_wait %p wait on tid %llu %llu\n",
22292394 inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
22302395 if (req1) {
....@@ -2232,15 +2397,19 @@
22322397 ceph_timeout_jiffies(req1->r_timeout));
22332398 if (ret)
22342399 err = -EIO;
2235
- ceph_mdsc_put_request(req1);
22362400 }
22372401 if (req2) {
22382402 ret = !wait_for_completion_timeout(&req2->r_safe_completion,
22392403 ceph_timeout_jiffies(req2->r_timeout));
22402404 if (ret)
22412405 err = -EIO;
2242
- ceph_mdsc_put_request(req2);
22432406 }
2407
+
2408
+out:
2409
+ if (req1)
2410
+ ceph_mdsc_put_request(req1);
2411
+ if (req2)
2412
+ ceph_mdsc_put_request(req2);
22442413 return err;
22452414 }
22462415
....@@ -2249,35 +2418,40 @@
22492418 struct inode *inode = file->f_mapping->host;
22502419 struct ceph_inode_info *ci = ceph_inode(inode);
22512420 u64 flush_tid;
2252
- int ret;
2421
+ int ret, err;
22532422 int dirty;
22542423
22552424 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
22562425
22572426 ret = file_write_and_wait_range(file, start, end);
2258
- if (ret < 0)
2259
- goto out;
2260
-
22612427 if (datasync)
22622428 goto out;
22632429
2264
- inode_lock(inode);
2430
+ ret = ceph_wait_on_async_create(inode);
2431
+ if (ret)
2432
+ goto out;
22652433
22662434 dirty = try_flush_caps(inode, &flush_tid);
22672435 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
22682436
2269
- ret = unsafe_request_wait(inode);
2437
+ err = unsafe_request_wait(inode);
22702438
22712439 /*
22722440 * only wait on non-file metadata writeback (the mds
22732441 * can recover size and mtime, so we don't need to
22742442 * wait for that)
22752443 */
2276
- if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2277
- ret = wait_event_interruptible(ci->i_cap_wq,
2444
+ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2445
+ err = wait_event_interruptible(ci->i_cap_wq,
22782446 caps_are_flushed(inode, flush_tid));
22792447 }
2280
- inode_unlock(inode);
2448
+
2449
+ if (err < 0)
2450
+ ret = err;
2451
+
2452
+ err = file_check_and_advance_wb_err(file);
2453
+ if (err < 0)
2454
+ ret = err;
22812455 out:
22822456 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
22832457 return ret;
....@@ -2327,6 +2501,16 @@
23272501 struct ceph_cap_flush *cf;
23282502 int ret;
23292503 u64 first_tid = 0;
2504
+ u64 last_snap_flush = 0;
2505
+
2506
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2507
+
2508
+ list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2509
+ if (cf->is_capsnap) {
2510
+ last_snap_flush = cf->tid;
2511
+ break;
2512
+ }
2513
+ }
23302514
23312515 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
23322516 if (cf->tid < first_tid)
....@@ -2341,22 +2525,20 @@
23412525
23422526 first_tid = cf->tid + 1;
23432527
2344
- if (cf->caps) {
2528
+ if (!cf->is_capsnap) {
2529
+ struct cap_msg_args arg;
2530
+
23452531 dout("kick_flushing_caps %p cap %p tid %llu %s\n",
23462532 inode, cap, cf->tid, ceph_cap_string(cf->caps));
2347
- ci->i_ceph_flags |= CEPH_I_NODELAY;
2348
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
2349
- false, __ceph_caps_used(ci),
2533
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
2534
+ (cf->tid < last_snap_flush ?
2535
+ CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2536
+ __ceph_caps_used(ci),
23502537 __ceph_caps_wanted(ci),
2351
- cap->issued | cap->implemented,
2538
+ (cap->issued | cap->implemented),
23522539 cf->caps, cf->tid, oldest_flush_tid);
2353
- if (ret) {
2354
- pr_err("kick_flushing_caps: error sending "
2355
- "cap flush, ino (%llx.%llx) "
2356
- "tid %llu flushing %s\n",
2357
- ceph_vinop(inode), cf->tid,
2358
- ceph_cap_string(cf->caps));
2359
- }
2540
+ spin_unlock(&ci->i_ceph_lock);
2541
+ __send_cap(&arg, ci);
23602542 } else {
23612543 struct ceph_cap_snap *capsnap =
23622544 container_of(cf, struct ceph_cap_snap,
....@@ -2417,7 +2599,12 @@
24172599 */
24182600 if ((cap->issued & ci->i_flushing_caps) !=
24192601 ci->i_flushing_caps) {
2420
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2602
+ /* encode_caps_cb() also will reset these sequence
2603
+ * numbers. make sure sequence numbers in cap flush
2604
+ * message match later reconnect message */
2605
+ cap->seq = 0;
2606
+ cap->issue_seq = 0;
2607
+ cap->mseq = 0;
24212608 __kick_flushing_caps(mdsc, session, ci,
24222609 oldest_flush_tid);
24232610 } else {
....@@ -2435,6 +2622,8 @@
24352622 struct ceph_cap *cap;
24362623 u64 oldest_flush_tid;
24372624
2625
+ lockdep_assert_held(&session->s_mutex);
2626
+
24382627 dout("kick_flushing_caps mds%d\n", session->s_mds);
24392628
24402629 spin_lock(&mdsc->cap_dirty_lock);
....@@ -2451,7 +2640,6 @@
24512640 continue;
24522641 }
24532642 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2454
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
24552643 __kick_flushing_caps(mdsc, session, ci,
24562644 oldest_flush_tid);
24572645 }
....@@ -2459,16 +2647,15 @@
24592647 }
24602648 }
24612649
2462
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2463
- struct ceph_mds_session *session,
2464
- struct inode *inode)
2465
- __releases(ci->i_ceph_lock)
2650
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2651
+ struct ceph_inode_info *ci)
24662652 {
2467
- struct ceph_inode_info *ci = ceph_inode(inode);
2468
- struct ceph_cap *cap;
2653
+ struct ceph_mds_client *mdsc = session->s_mdsc;
2654
+ struct ceph_cap *cap = ci->i_auth_cap;
24692655
2470
- cap = ci->i_auth_cap;
2471
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
2656
+ lockdep_assert_held(&ci->i_ceph_lock);
2657
+
2658
+ dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
24722659 ceph_cap_string(ci->i_flushing_caps));
24732660
24742661 if (!list_empty(&ci->i_cap_flush_list)) {
....@@ -2479,11 +2666,7 @@
24792666 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
24802667 spin_unlock(&mdsc->cap_dirty_lock);
24812668
2482
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
24832669 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2484
- spin_unlock(&ci->i_ceph_lock);
2485
- } else {
2486
- spin_unlock(&ci->i_ceph_lock);
24872670 }
24882671 }
24892672
....@@ -2491,18 +2674,20 @@
24912674 /*
24922675 * Take references to capabilities we hold, so that we don't release
24932676 * them to the MDS prematurely.
2494
- *
2495
- * Protected by i_ceph_lock.
24962677 */
2497
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
2678
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
24982679 bool snap_rwsem_locked)
24992680 {
2681
+ lockdep_assert_held(&ci->i_ceph_lock);
2682
+
25002683 if (got & CEPH_CAP_PIN)
25012684 ci->i_pin_ref++;
25022685 if (got & CEPH_CAP_FILE_RD)
25032686 ci->i_rd_ref++;
25042687 if (got & CEPH_CAP_FILE_CACHE)
25052688 ci->i_rdcache_ref++;
2689
+ if (got & CEPH_CAP_FILE_EXCL)
2690
+ ci->i_fx_ref++;
25062691 if (got & CEPH_CAP_FILE_WR) {
25072692 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
25082693 BUG_ON(!snap_rwsem_locked);
....@@ -2515,7 +2700,7 @@
25152700 if (ci->i_wb_ref == 0)
25162701 ihold(&ci->vfs_inode);
25172702 ci->i_wb_ref++;
2518
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
2703
+ dout("%s %p wb %d -> %d (?)\n", __func__,
25192704 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
25202705 }
25212706 }
....@@ -2526,15 +2711,26 @@
25262711 * to (when applicable), and check against max_size here as well.
25272712 * Note that caller is responsible for ensuring max_size increases are
25282713 * requested from the MDS.
2714
+ *
2715
+ * Returns 0 if caps were not able to be acquired (yet), 1 on success,
2716
+ * or a negative error code. There are 3 special error codes:
2717
+ * -EAGAIN: need to sleep but non-blocking is specified
2718
+ * -EFBIG: ask caller to call check_max_size() and try again.
2719
+ * -ESTALE: ask caller to call ceph_renew_caps() and try again.
25292720 */
2530
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2531
- loff_t endoff, bool nonblock, int *got, int *err)
2721
+enum {
2722
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
2723
+ NON_BLOCKING = (1 << 8),
2724
+ CHECK_FILELOCK = (1 << 9),
2725
+};
2726
+
2727
+static int try_get_cap_refs(struct inode *inode, int need, int want,
2728
+ loff_t endoff, int flags, int *got)
25322729 {
2533
- struct inode *inode = &ci->vfs_inode;
2730
+ struct ceph_inode_info *ci = ceph_inode(inode);
25342731 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
25352732 int ret = 0;
25362733 int have, implemented;
2537
- int file_wanted;
25382734 bool snap_rwsem_locked = false;
25392735
25402736 dout("get_cap_refs %p need %s want %s\n", inode,
....@@ -2543,13 +2739,10 @@
25432739 again:
25442740 spin_lock(&ci->i_ceph_lock);
25452741
2546
- /* make sure file is actually open */
2547
- file_wanted = __ceph_caps_file_wanted(ci);
2548
- if ((file_wanted & need) != need) {
2549
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
2550
- ceph_cap_string(need), ceph_cap_string(file_wanted));
2551
- *err = -EBADF;
2552
- ret = 1;
2742
+ if ((flags & CHECK_FILELOCK) &&
2743
+ (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2744
+ dout("try_get_cap_refs %p error filelock\n", inode);
2745
+ ret = -EIO;
25532746 goto out_unlock;
25542747 }
25552748
....@@ -2570,10 +2763,8 @@
25702763 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
25712764 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
25722765 inode, endoff, ci->i_max_size);
2573
- if (endoff > ci->i_requested_max_size) {
2574
- *err = -EAGAIN;
2575
- ret = 1;
2576
- }
2766
+ if (endoff > ci->i_requested_max_size)
2767
+ ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
25772768 goto out_unlock;
25782769 }
25792770 /*
....@@ -2607,9 +2798,8 @@
26072798 * we can not call down_read() when
26082799 * task isn't in TASK_RUNNING state
26092800 */
2610
- if (nonblock) {
2611
- *err = -EAGAIN;
2612
- ret = 1;
2801
+ if (flags & NON_BLOCKING) {
2802
+ ret = -EAGAIN;
26132803 goto out_unlock;
26142804 }
26152805
....@@ -2620,57 +2810,63 @@
26202810 }
26212811 snap_rwsem_locked = true;
26222812 }
2623
- *got = need | (have & want);
2624
- if ((need & CEPH_CAP_FILE_RD) &&
2813
+ if ((have & want) == want)
2814
+ *got = need | want;
2815
+ else
2816
+ *got = need;
2817
+ if (S_ISREG(inode->i_mode) &&
2818
+ (need & CEPH_CAP_FILE_RD) &&
26252819 !(*got & CEPH_CAP_FILE_CACHE))
26262820 ceph_disable_fscache_readpage(ci);
2627
- __take_cap_refs(ci, *got, true);
2821
+ ceph_take_cap_refs(ci, *got, true);
26282822 ret = 1;
26292823 }
26302824 } else {
26312825 int session_readonly = false;
2632
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
2826
+ int mds_wanted;
2827
+ if (ci->i_auth_cap &&
2828
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
26332829 struct ceph_mds_session *s = ci->i_auth_cap->session;
26342830 spin_lock(&s->s_cap_lock);
26352831 session_readonly = s->s_readonly;
26362832 spin_unlock(&s->s_cap_lock);
26372833 }
26382834 if (session_readonly) {
2639
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
2835
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
26402836 inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2641
- *err = -EROFS;
2642
- ret = 1;
2837
+ ret = -EROFS;
26432838 goto out_unlock;
26442839 }
26452840
2646
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
2647
- int mds_wanted;
2648
- if (READ_ONCE(mdsc->fsc->mount_state) ==
2649
- CEPH_MOUNT_SHUTDOWN) {
2650
- dout("get_cap_refs %p forced umount\n", inode);
2651
- *err = -EIO;
2652
- ret = 1;
2653
- goto out_unlock;
2654
- }
2655
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
2656
- if (need & ~(mds_wanted & need)) {
2657
- dout("get_cap_refs %p caps were dropped"
2658
- " (session killed?)\n", inode);
2659
- *err = -ESTALE;
2660
- ret = 1;
2661
- goto out_unlock;
2662
- }
2663
- if (!(file_wanted & ~mds_wanted))
2664
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
2841
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2842
+ dout("get_cap_refs %p forced umount\n", inode);
2843
+ ret = -EIO;
2844
+ goto out_unlock;
2845
+ }
2846
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
2847
+ if (need & ~mds_wanted) {
2848
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
2849
+ inode, ceph_cap_string(need),
2850
+ ceph_cap_string(mds_wanted));
2851
+ ret = -ESTALE;
2852
+ goto out_unlock;
26652853 }
26662854
2667
- dout("get_cap_refs %p have %s needed %s\n", inode,
2855
+ dout("get_cap_refs %p have %s need %s\n", inode,
26682856 ceph_cap_string(have), ceph_cap_string(need));
26692857 }
26702858 out_unlock:
2859
+
2860
+ __ceph_touch_fmode(ci, mdsc, flags);
2861
+
26712862 spin_unlock(&ci->i_ceph_lock);
26722863 if (snap_rwsem_locked)
26732864 up_read(&mdsc->snap_rwsem);
2865
+
2866
+ if (!ret)
2867
+ ceph_update_cap_mis(&mdsc->metric);
2868
+ else if (ret == 1)
2869
+ ceph_update_cap_hit(&mdsc->metric);
26742870
26752871 dout("get_cap_refs %p ret %d got %s\n", inode,
26762872 ret, ceph_cap_string(*got));
....@@ -2705,24 +2901,39 @@
27052901 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
27062902 }
27072903
2708
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
2904
+static inline int get_used_fmode(int caps)
27092905 {
2710
- int ret, err = 0;
2906
+ int fmode = 0;
2907
+ if (caps & CEPH_CAP_FILE_RD)
2908
+ fmode |= CEPH_FILE_MODE_RD;
2909
+ if (caps & CEPH_CAP_FILE_WR)
2910
+ fmode |= CEPH_FILE_MODE_WR;
2911
+ return fmode;
2912
+}
2913
+
2914
+int ceph_try_get_caps(struct inode *inode, int need, int want,
2915
+ bool nonblock, int *got)
2916
+{
2917
+ int ret, flags;
27112918
27122919 BUG_ON(need & ~CEPH_CAP_FILE_RD);
2713
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
2714
- ret = ceph_pool_perm_check(ci, need);
2715
- if (ret < 0)
2716
- return ret;
2717
-
2718
- ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
2719
- if (ret) {
2720
- if (err == -EAGAIN) {
2721
- ret = 0;
2722
- } else if (err < 0) {
2723
- ret = err;
2724
- }
2920
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
2921
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2922
+ CEPH_CAP_ANY_DIR_OPS));
2923
+ if (need) {
2924
+ ret = ceph_pool_perm_check(inode, need);
2925
+ if (ret < 0)
2926
+ return ret;
27252927 }
2928
+
2929
+ flags = get_used_fmode(need | want);
2930
+ if (nonblock)
2931
+ flags |= NON_BLOCKING;
2932
+
2933
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
2934
+ /* three special error codes */
2935
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
2936
+ ret = 0;
27262937 return ret;
27272938 }
27282939
....@@ -2731,34 +2942,54 @@
27312942 * due to a small max_size, make sure we check_max_size (and possibly
27322943 * ask the mds) so we don't get hung up indefinitely.
27332944 */
2734
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2945
+int ceph_get_caps(struct file *filp, int need, int want,
27352946 loff_t endoff, int *got, struct page **pinned_page)
27362947 {
2737
- int _got, ret, err = 0;
2948
+ struct ceph_file_info *fi = filp->private_data;
2949
+ struct inode *inode = file_inode(filp);
2950
+ struct ceph_inode_info *ci = ceph_inode(inode);
2951
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2952
+ int ret, _got, flags;
27382953
2739
- ret = ceph_pool_perm_check(ci, need);
2954
+ ret = ceph_pool_perm_check(inode, need);
27402955 if (ret < 0)
27412956 return ret;
27422957
2743
- while (true) {
2744
- if (endoff > 0)
2745
- check_max_size(&ci->vfs_inode, endoff);
2958
+ if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2959
+ fi->filp_gen != READ_ONCE(fsc->filp_gen))
2960
+ return -EBADF;
27462961
2747
- err = 0;
2962
+ flags = get_used_fmode(need | want);
2963
+
2964
+ while (true) {
2965
+ flags &= CEPH_FILE_MODE_MASK;
2966
+ if (vfs_inode_has_locks(inode))
2967
+ flags |= CHECK_FILELOCK;
27482968 _got = 0;
2749
- ret = try_get_cap_refs(ci, need, want, endoff,
2750
- false, &_got, &err);
2751
- if (ret) {
2752
- if (err == -EAGAIN)
2753
- continue;
2754
- if (err < 0)
2755
- ret = err;
2756
- } else {
2969
+ ret = try_get_cap_refs(inode, need, want, endoff,
2970
+ flags, &_got);
2971
+ WARN_ON_ONCE(ret == -EAGAIN);
2972
+ if (!ret) {
2973
+ struct ceph_mds_client *mdsc = fsc->mdsc;
2974
+ struct cap_wait cw;
27572975 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2976
+
2977
+ cw.ino = ceph_ino(inode);
2978
+ cw.tgid = current->tgid;
2979
+ cw.need = need;
2980
+ cw.want = want;
2981
+
2982
+ spin_lock(&mdsc->caps_list_lock);
2983
+ list_add(&cw.list, &mdsc->cap_wait_list);
2984
+ spin_unlock(&mdsc->caps_list_lock);
2985
+
2986
+ /* make sure used fmode not timeout */
2987
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
27582988 add_wait_queue(&ci->i_cap_wq, &wait);
27592989
2760
- while (!try_get_cap_refs(ci, need, want, endoff,
2761
- true, &_got, &err)) {
2990
+ flags |= NON_BLOCKING;
2991
+ while (!(ret = try_get_cap_refs(inode, need, want,
2992
+ endoff, flags, &_got))) {
27622993 if (signal_pending(current)) {
27632994 ret = -ERESTARTSYS;
27642995 break;
....@@ -2767,27 +2998,48 @@
27672998 }
27682999
27693000 remove_wait_queue(&ci->i_cap_wq, &wait);
3001
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
27703002
2771
- if (err == -EAGAIN)
3003
+ spin_lock(&mdsc->caps_list_lock);
3004
+ list_del(&cw.list);
3005
+ spin_unlock(&mdsc->caps_list_lock);
3006
+
3007
+ if (ret == -EAGAIN)
27723008 continue;
2773
- if (err < 0)
2774
- ret = err;
27753009 }
3010
+
3011
+ if ((fi->fmode & CEPH_FILE_MODE_WR) &&
3012
+ fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
3013
+ if (ret >= 0 && _got)
3014
+ ceph_put_cap_refs(ci, _got);
3015
+ return -EBADF;
3016
+ }
3017
+
27763018 if (ret < 0) {
2777
- if (err == -ESTALE) {
3019
+ if (ret == -EFBIG || ret == -ESTALE) {
3020
+ int ret2 = ceph_wait_on_async_create(inode);
3021
+ if (ret2 < 0)
3022
+ return ret2;
3023
+ }
3024
+ if (ret == -EFBIG) {
3025
+ check_max_size(inode, endoff);
3026
+ continue;
3027
+ }
3028
+ if (ret == -ESTALE) {
27783029 /* session was killed, try renew caps */
2779
- ret = ceph_renew_caps(&ci->vfs_inode);
3030
+ ret = ceph_renew_caps(inode, flags);
27803031 if (ret == 0)
27813032 continue;
27823033 }
27833034 return ret;
27843035 }
27853036
2786
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
3037
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
3038
+ ci->i_inline_version != CEPH_INLINE_NONE &&
27873039 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2788
- i_size_read(&ci->vfs_inode) > 0) {
3040
+ i_size_read(inode) > 0) {
27893041 struct page *page =
2790
- find_get_page(ci->vfs_inode.i_mapping, 0);
3042
+ find_get_page(inode->i_mapping, 0);
27913043 if (page) {
27923044 if (PageUptodate(page)) {
27933045 *pinned_page = page;
....@@ -2806,7 +3058,7 @@
28063058 * getattr request will bring inline data into
28073059 * page cache
28083060 */
2809
- ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
3061
+ ret = __ceph_do_getattr(inode, NULL,
28103062 CEPH_STAT_CAP_INLINE_DATA,
28113063 true);
28123064 if (ret < 0)
....@@ -2816,7 +3068,8 @@
28163068 break;
28173069 }
28183070
2819
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
3071
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
3072
+ (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
28203073 ceph_fscache_revalidate_cookie(ci);
28213074
28223075 *got = _got;
....@@ -2830,7 +3083,7 @@
28303083 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
28313084 {
28323085 spin_lock(&ci->i_ceph_lock);
2833
- __take_cap_refs(ci, caps, false);
3086
+ ceph_take_cap_refs(ci, caps, false);
28343087 spin_unlock(&ci->i_ceph_lock);
28353088 }
28363089
....@@ -2867,7 +3120,8 @@
28673120 * If we are releasing a WR cap (from a sync write), finalize any affected
28683121 * cap_snap, and wake up any waiters.
28693122 */
2870
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3123
+static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
3124
+ bool skip_checking_caps)
28713125 {
28723126 struct inode *inode = &ci->vfs_inode;
28733127 int last = 0, put = 0, flushsnaps = 0, wake = 0;
....@@ -2880,6 +3134,9 @@
28803134 last++;
28813135 if (had & CEPH_CAP_FILE_CACHE)
28823136 if (--ci->i_rdcache_ref == 0)
3137
+ last++;
3138
+ if (had & CEPH_CAP_FILE_EXCL)
3139
+ if (--ci->i_fx_ref == 0)
28833140 last++;
28843141 if (had & CEPH_CAP_FILE_BUFFER) {
28853142 if (--ci->i_wb_ref == 0) {
....@@ -2912,7 +3169,7 @@
29123169 ci->i_head_snapc = NULL;
29133170 }
29143171 /* see comment in __ceph_remove_cap() */
2915
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
3172
+ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
29163173 drop_inode_snap_realm(ci);
29173174 }
29183175 spin_unlock(&ci->i_ceph_lock);
....@@ -2920,14 +3177,26 @@
29203177 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
29213178 last ? " last" : "", put ? " put" : "");
29223179
2923
- if (last && !flushsnaps)
2924
- ceph_check_caps(ci, 0, NULL);
2925
- else if (flushsnaps)
2926
- ceph_flush_snaps(ci, NULL);
3180
+ if (!skip_checking_caps) {
3181
+ if (last)
3182
+ ceph_check_caps(ci, 0, NULL);
3183
+ else if (flushsnaps)
3184
+ ceph_flush_snaps(ci, NULL);
3185
+ }
29273186 if (wake)
29283187 wake_up_all(&ci->i_cap_wq);
29293188 while (put-- > 0)
29303189 iput(inode);
3190
+}
3191
+
3192
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3193
+{
3194
+ __ceph_put_cap_refs(ci, had, false);
3195
+}
3196
+
3197
+void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
3198
+{
3199
+ __ceph_put_cap_refs(ci, had, true);
29313200 }
29323201
29333202 /*
....@@ -2977,7 +3246,16 @@
29773246 break;
29783247 }
29793248 }
2980
- BUG_ON(!found);
3249
+
3250
+ if (!found) {
3251
+ /*
3252
+ * The capsnap should already be removed when removing
3253
+ * auth cap in the case of a forced unmount.
3254
+ */
3255
+ WARN_ON_ONCE(ci->i_auth_cap);
3256
+ goto unlock;
3257
+ }
3258
+
29813259 capsnap->dirty_pages -= nr;
29823260 if (capsnap->dirty_pages == 0) {
29833261 complete_capsnap = true;
....@@ -2999,17 +3277,20 @@
29993277 complete_capsnap ? " (complete capsnap)" : "");
30003278 }
30013279
3280
+unlock:
30023281 spin_unlock(&ci->i_ceph_lock);
30033282
30043283 if (last) {
3005
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
3284
+ ceph_check_caps(ci, 0, NULL);
30063285 } else if (flush_snaps) {
30073286 ceph_flush_snaps(ci, NULL);
30083287 }
30093288 if (complete_capsnap)
30103289 wake_up_all(&ci->i_cap_wq);
3011
- while (put-- > 0)
3012
- iput(inode);
3290
+ while (put-- > 0) {
3291
+ /* avoid calling iput_final() in osd dispatch threads */
3292
+ ceph_async_iput(inode);
3293
+ }
30133294 }
30143295
30153296 /*
....@@ -3054,8 +3335,10 @@
30543335 bool dirstat_valid;
30553336 u64 nfiles;
30563337 u64 nsubdirs;
3338
+ u64 change_attr;
30573339 /* currently issued */
30583340 int issued;
3341
+ struct timespec64 btime;
30593342 };
30603343
30613344 /*
....@@ -3079,7 +3362,8 @@
30793362 int used, wanted, dirty;
30803363 u64 size = le64_to_cpu(grant->size);
30813364 u64 max_size = le64_to_cpu(grant->max_size);
3082
- int check_caps = 0;
3365
+ unsigned char check_caps = 0;
3366
+ bool was_stale = cap->cap_gen < session->s_cap_gen;
30833367 bool wake = false;
30843368 bool writeback = false;
30853369 bool queue_trunc = false;
....@@ -3092,6 +3376,28 @@
30923376 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
30933377 inode->i_size);
30943378
3379
+
3380
+ /*
3381
+ * If CACHE is being revoked, and we have no dirty buffers,
3382
+ * try to invalidate (once). (If there are dirty buffers, we
3383
+ * will invalidate _after_ writeback.)
3384
+ */
3385
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
3386
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3387
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3388
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3389
+ if (try_nonblocking_invalidate(inode)) {
3390
+ /* there were locked pages.. invalidate later
3391
+ in a separate thread. */
3392
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3393
+ queue_invalidate = true;
3394
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
3395
+ }
3396
+ }
3397
+ }
3398
+
3399
+ if (was_stale)
3400
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
30953401
30963402 /*
30973403 * auth mds of the inode changed. we received the cap export message,
....@@ -3108,36 +3414,20 @@
31083414 newcaps |= cap->issued;
31093415 }
31103416
3111
- /*
3112
- * If CACHE is being revoked, and we have no dirty buffers,
3113
- * try to invalidate (once). (If there are dirty buffers, we
3114
- * will invalidate _after_ writeback.)
3115
- */
3116
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
3117
- ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3118
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3119
- !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3120
- if (try_nonblocking_invalidate(inode)) {
3121
- /* there were locked pages.. invalidate later
3122
- in a separate thread. */
3123
- if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3124
- queue_invalidate = true;
3125
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
3126
- }
3127
- }
3128
- }
3129
-
31303417 /* side effects now are allowed */
31313418 cap->cap_gen = session->s_cap_gen;
31323419 cap->seq = seq;
31333420
31343421 __check_cap_issue(ci, cap, newcaps);
31353422
3423
+ inode_set_max_iversion_raw(inode, extra_info->change_attr);
3424
+
31363425 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
31373426 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
31383427 inode->i_mode = le32_to_cpu(grant->mode);
31393428 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
31403429 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
3430
+ ci->i_btime = extra_info->btime;
31413431 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
31423432 from_kuid(&init_user_ns, inode->i_uid),
31433433 from_kgid(&init_user_ns, inode->i_gid));
....@@ -3164,6 +3454,7 @@
31643454 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
31653455 ci->i_xattrs.version = version;
31663456 ceph_forget_all_cached_acls(inode);
3457
+ ceph_security_invalidate_secctx(inode);
31673458 }
31683459 }
31693460
....@@ -3216,10 +3507,6 @@
32163507 ci->i_requested_max_size = 0;
32173508 }
32183509 wake = true;
3219
- } else if (ci->i_wanted_max_size > ci->i_max_size &&
3220
- ci->i_wanted_max_size > ci->i_requested_max_size) {
3221
- /* CEPH_CAP_OP_IMPORT */
3222
- wake = true;
32233510 }
32243511 }
32253512
....@@ -3231,13 +3518,20 @@
32313518 ceph_cap_string(wanted),
32323519 ceph_cap_string(used),
32333520 ceph_cap_string(dirty));
3234
- if (wanted != le32_to_cpu(grant->wanted)) {
3235
- dout("mds wanted %s -> %s\n",
3236
- ceph_cap_string(le32_to_cpu(grant->wanted)),
3237
- ceph_cap_string(wanted));
3238
- /* imported cap may not have correct mds_wanted */
3239
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
3240
- check_caps = 1;
3521
+
3522
+ if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3523
+ (wanted & ~(cap->mds_wanted | newcaps))) {
3524
+ /*
3525
+ * If mds is importing cap, prior cap messages that update
3526
+ * 'wanted' may get dropped by mds (migrate seq mismatch).
3527
+ *
3528
+ * We don't send cap message to update 'wanted' if what we
3529
+ * want are already issued. If mds revokes caps, cap message
3530
+ * that releases caps also tells mds what we want. But if
3531
+ * caps got revoked by mds forcedly (session stale). We may
3532
+ * haven't told mds what we want.
3533
+ */
3534
+ check_caps = 1;
32413535 }
32423536
32433537 /* revocation, grant, or no-op? */
....@@ -3248,11 +3542,12 @@
32483542 ceph_cap_string(cap->issued),
32493543 ceph_cap_string(newcaps),
32503544 ceph_cap_string(revoking));
3251
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
3545
+ if (S_ISREG(inode->i_mode) &&
3546
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
32523547 writeback = true; /* initiate writeback; will delay ack */
3253
- else if (revoking == CEPH_CAP_FILE_CACHE &&
3254
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3255
- queue_invalidate)
3548
+ else if (queue_invalidate &&
3549
+ revoking == CEPH_CAP_FILE_CACHE &&
3550
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
32563551 ; /* do nothing yet, invalidation will be queued */
32573552 else if (cap == ci->i_auth_cap)
32583553 check_caps = 1; /* check auth cap only */
....@@ -3279,6 +3574,15 @@
32793574 }
32803575 BUG_ON(cap->issued & ~cap->implemented);
32813576
3577
+ /* don't let check_caps skip sending a response to MDS for revoke msgs */
3578
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
3579
+ cap->mds_wanted = 0;
3580
+ if (cap == ci->i_auth_cap)
3581
+ check_caps = 1; /* check auth cap only */
3582
+ else
3583
+ check_caps = 2; /* check all caps */
3584
+ }
3585
+
32823586 if (extra_info->inline_version > 0 &&
32833587 extra_info->inline_version >= ci->i_inline_version) {
32843588 ci->i_inline_version = extra_info->inline_version;
....@@ -3288,13 +3592,22 @@
32883592 }
32893593
32903594 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3291
- if (newcaps & ~extra_info->issued)
3292
- wake = true;
3293
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
3595
+ if (ci->i_auth_cap == cap) {
3596
+ if (newcaps & ~extra_info->issued)
3597
+ wake = true;
3598
+
3599
+ if (ci->i_requested_max_size > max_size ||
3600
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3601
+ /* re-request max_size if necessary */
3602
+ ci->i_requested_max_size = 0;
3603
+ wake = true;
3604
+ }
3605
+
3606
+ ceph_kick_flushing_inode_caps(session, ci);
3607
+ }
32943608 up_read(&session->s_mdsc->snap_rwsem);
3295
- } else {
3296
- spin_unlock(&ci->i_ceph_lock);
32973609 }
3610
+ spin_unlock(&ci->i_ceph_lock);
32983611
32993612 if (fill_inline)
33003613 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
....@@ -3318,10 +3631,10 @@
33183631 wake_up_all(&ci->i_cap_wq);
33193632
33203633 if (check_caps == 1)
3321
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
3634
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
33223635 session);
33233636 else if (check_caps == 2)
3324
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
3637
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
33253638 else
33263639 mutex_unlock(&session->s_mutex);
33273640 }
....@@ -3348,15 +3661,26 @@
33483661 bool wake_mdsc = false;
33493662
33503663 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3664
+ /* Is this the one that was flushed? */
33513665 if (cf->tid == flush_tid)
33523666 cleaned = cf->caps;
3353
- if (cf->caps == 0) /* capsnap */
3667
+
3668
+ /* Is this a capsnap? */
3669
+ if (cf->is_capsnap)
33543670 continue;
3671
+
33553672 if (cf->tid <= flush_tid) {
3356
- if (__finish_cap_flush(NULL, ci, cf))
3357
- wake_ci = true;
3673
+ /*
3674
+ * An earlier or current tid. The FLUSH_ACK should
3675
+ * represent a superset of this flush's caps.
3676
+ */
3677
+ wake_ci |= __detach_cap_flush_from_ci(ci, cf);
33583678 list_add_tail(&cf->i_list, &to_remove);
33593679 } else {
3680
+ /*
3681
+ * This is a later one. Any caps in it are still dirty
3682
+ * so don't count them as cleaned.
3683
+ */
33603684 cleaned &= ~cf->caps;
33613685 if (!cleaned)
33623686 break;
....@@ -3376,10 +3700,8 @@
33763700
33773701 spin_lock(&mdsc->cap_dirty_lock);
33783702
3379
- list_for_each_entry(cf, &to_remove, i_list) {
3380
- if (__finish_cap_flush(mdsc, NULL, cf))
3381
- wake_mdsc = true;
3382
- }
3703
+ list_for_each_entry(cf, &to_remove, i_list)
3704
+ wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
33833705
33843706 if (ci->i_flushing_caps == 0) {
33853707 if (list_empty(&ci->i_cap_flush_list)) {
....@@ -3417,8 +3739,9 @@
34173739 while (!list_empty(&to_remove)) {
34183740 cf = list_first_entry(&to_remove,
34193741 struct ceph_cap_flush, i_list);
3420
- list_del(&cf->i_list);
3421
- ceph_free_cap_flush(cf);
3742
+ list_del_init(&cf->i_list);
3743
+ if (!cf->is_capsnap)
3744
+ ceph_free_cap_flush(cf);
34223745 }
34233746
34243747 if (wake_ci)
....@@ -3427,6 +3750,43 @@
34273750 wake_up_all(&mdsc->cap_flushing_wq);
34283751 if (drop)
34293752 iput(inode);
3753
+}
3754
+
3755
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3756
+ bool *wake_ci, bool *wake_mdsc)
3757
+{
3758
+ struct ceph_inode_info *ci = ceph_inode(inode);
3759
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3760
+ bool ret;
3761
+
3762
+ lockdep_assert_held(&ci->i_ceph_lock);
3763
+
3764
+ dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
3765
+
3766
+ list_del_init(&capsnap->ci_item);
3767
+ ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
3768
+ if (wake_ci)
3769
+ *wake_ci = ret;
3770
+
3771
+ spin_lock(&mdsc->cap_dirty_lock);
3772
+ if (list_empty(&ci->i_cap_flush_list))
3773
+ list_del_init(&ci->i_flushing_item);
3774
+
3775
+ ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
3776
+ if (wake_mdsc)
3777
+ *wake_mdsc = ret;
3778
+ spin_unlock(&mdsc->cap_dirty_lock);
3779
+}
3780
+
3781
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3782
+ bool *wake_ci, bool *wake_mdsc)
3783
+{
3784
+ struct ceph_inode_info *ci = ceph_inode(inode);
3785
+
3786
+ lockdep_assert_held(&ci->i_ceph_lock);
3787
+
3788
+ WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
3789
+ __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
34303790 }
34313791
34323792 /*
....@@ -3466,25 +3826,10 @@
34663826 capsnap, capsnap->follows);
34673827 }
34683828 }
3469
- if (flushed) {
3470
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
3471
- dout(" removing %p cap_snap %p follows %lld\n",
3472
- inode, capsnap, follows);
3473
- list_del(&capsnap->ci_item);
3474
- if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
3475
- wake_ci = true;
3476
-
3477
- spin_lock(&mdsc->cap_dirty_lock);
3478
-
3479
- if (list_empty(&ci->i_cap_flush_list))
3480
- list_del_init(&ci->i_flushing_item);
3481
-
3482
- if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
3483
- wake_mdsc = true;
3484
-
3485
- spin_unlock(&mdsc->cap_dirty_lock);
3486
- }
3829
+ if (flushed)
3830
+ ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
34873831 spin_unlock(&ci->i_ceph_lock);
3832
+
34883833 if (flushed) {
34893834 ceph_put_snap_context(capsnap->context);
34903835 ceph_put_cap_snap(capsnap);
....@@ -3501,10 +3846,9 @@
35013846 *
35023847 * caller hold s_mutex.
35033848 */
3504
-static void handle_cap_trunc(struct inode *inode,
3849
+static bool handle_cap_trunc(struct inode *inode,
35053850 struct ceph_mds_caps *trunc,
35063851 struct ceph_mds_session *session)
3507
- __releases(ci->i_ceph_lock)
35083852 {
35093853 struct ceph_inode_info *ci = ceph_inode(inode);
35103854 int mds = session->s_mds;
....@@ -3515,7 +3859,9 @@
35153859 int implemented = 0;
35163860 int dirty = __ceph_caps_dirty(ci);
35173861 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
3518
- int queue_trunc = 0;
3862
+ bool queue_trunc = false;
3863
+
3864
+ lockdep_assert_held(&ci->i_ceph_lock);
35193865
35203866 issued |= implemented | dirty;
35213867
....@@ -3523,10 +3869,7 @@
35233869 inode, mds, seq, truncate_size, truncate_seq);
35243870 queue_trunc = ceph_fill_file_size(inode, issued,
35253871 truncate_seq, truncate_size, size);
3526
- spin_unlock(&ci->i_ceph_lock);
3527
-
3528
- if (queue_trunc)
3529
- ceph_queue_vmtruncate(inode);
3872
+ return queue_trunc;
35303873 }
35313874
35323875 /*
....@@ -3571,8 +3914,6 @@
35713914
35723915 if (target < 0) {
35733916 __ceph_remove_cap(cap, false);
3574
- if (!ci->i_auth_cap)
3575
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
35763917 goto out_unlock;
35773918 }
35783919
....@@ -3602,15 +3943,9 @@
36023943 tcap->issue_seq = t_seq - 1;
36033944 tcap->issued |= issued;
36043945 tcap->implemented |= issued;
3605
- if (cap == ci->i_auth_cap)
3946
+ if (cap == ci->i_auth_cap) {
36063947 ci->i_auth_cap = tcap;
3607
-
3608
- if (!list_empty(&ci->i_cap_flush_list) &&
3609
- ci->i_auth_cap == tcap) {
3610
- spin_lock(&mdsc->cap_dirty_lock);
3611
- list_move_tail(&ci->i_flushing_item,
3612
- &tcap->session->s_cap_flushing);
3613
- spin_unlock(&mdsc->cap_dirty_lock);
3948
+ change_auth_cap_ses(ci, tcap->session);
36143949 }
36153950 }
36163951 __ceph_remove_cap(cap, false);
....@@ -3619,7 +3954,7 @@
36193954 /* add placeholder for the export tagert */
36203955 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
36213956 tcap = new_cap;
3622
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
3957
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
36233958 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
36243959
36253960 if (!list_empty(&ci->i_cap_flush_list) &&
....@@ -3679,7 +4014,6 @@
36794014 struct ceph_mds_cap_peer *ph,
36804015 struct ceph_mds_session *session,
36814016 struct ceph_cap **target_cap, int *old_issued)
3682
- __acquires(ci->i_ceph_lock)
36834017 {
36844018 struct ceph_inode_info *ci = ceph_inode(inode);
36854019 struct ceph_cap *cap, *ocap, *new_cap = NULL;
....@@ -3704,14 +4038,13 @@
37044038
37054039 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
37064040 inode, ci, mds, mseq, peer);
3707
-
37084041 retry:
3709
- spin_lock(&ci->i_ceph_lock);
37104042 cap = __get_cap_for_mds(ci, mds);
37114043 if (!cap) {
37124044 if (!new_cap) {
37134045 spin_unlock(&ci->i_ceph_lock);
37144046 new_cap = ceph_get_cap(mdsc, NULL);
4047
+ spin_lock(&ci->i_ceph_lock);
37154048 goto retry;
37164049 }
37174050 cap = new_cap;
....@@ -3725,7 +4058,7 @@
37254058 __ceph_caps_issued(ci, &issued);
37264059 issued |= __ceph_caps_dirty(ci);
37274060
3728
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
4061
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
37294062 realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
37304063
37314064 ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
....@@ -3745,9 +4078,6 @@
37454078 }
37464079 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
37474080 }
3748
-
3749
- /* make sure we re-request max_size, if necessary */
3750
- ci->i_requested_max_size = 0;
37514081
37524082 *old_issued = issued;
37534083 *target_cap = cap;
....@@ -3777,6 +4107,7 @@
37774107 size_t snaptrace_len;
37784108 void *p, *end;
37794109 struct cap_extra_info extra_info = {};
4110
+ bool queue_trunc;
37804111
37814112 dout("handle_caps from mds%d\n", session->s_mds);
37824113
....@@ -3852,17 +4183,19 @@
38524183 }
38534184 }
38544185
3855
- if (msg_version >= 11) {
4186
+ if (msg_version >= 9) {
38564187 struct ceph_timespec *btime;
3857
- u64 change_attr;
3858
- u32 flags;
38594188
3860
- /* version >= 9 */
38614189 if (p + sizeof(*btime) > end)
38624190 goto bad;
38634191 btime = p;
4192
+ ceph_decode_timespec64(&extra_info.btime, btime);
38644193 p += sizeof(*btime);
3865
- ceph_decode_64_safe(&p, end, change_attr, bad);
4194
+ ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4195
+ }
4196
+
4197
+ if (msg_version >= 11) {
4198
+ u32 flags;
38664199 /* version >= 10 */
38674200 ceph_decode_32_safe(&p, end, flags, bad);
38684201 /* version >= 11 */
....@@ -3878,7 +4211,7 @@
38784211 vino.snap, inode);
38794212
38804213 mutex_lock(&session->s_mutex);
3881
- session->s_seq++;
4214
+ inc_session_sequence(session);
38824215 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
38834216 (unsigned)seq);
38844217
....@@ -3894,9 +4227,7 @@
38944227 cap->seq = seq;
38954228 cap->issue_seq = seq;
38964229 spin_lock(&session->s_cap_lock);
3897
- list_add_tail(&cap->session_caps,
3898
- &session->s_cap_releases);
3899
- session->s_num_cap_releases++;
4230
+ __ceph_queue_cap_release(session, cap);
39004231 spin_unlock(&session->s_cap_lock);
39014232 }
39024233 goto flush_cap_releases;
....@@ -3924,6 +4255,7 @@
39244255 } else {
39254256 down_read(&mdsc->snap_rwsem);
39264257 }
4258
+ spin_lock(&ci->i_ceph_lock);
39274259 handle_cap_import(mdsc, inode, h, peer, session,
39284260 &cap, &extra_info.issued);
39294261 handle_cap_grant(inode, session, cap,
....@@ -3960,7 +4292,10 @@
39604292 break;
39614293
39624294 case CEPH_CAP_OP_TRUNC:
3963
- handle_cap_trunc(inode, h, session);
4295
+ queue_trunc = handle_cap_trunc(inode, h, session);
4296
+ spin_unlock(&ci->i_ceph_lock);
4297
+ if (queue_trunc)
4298
+ ceph_queue_vmtruncate(inode);
39644299 break;
39654300
39664301 default:
....@@ -3969,7 +4304,13 @@
39694304 ceph_cap_op_name(op));
39704305 }
39714306
3972
- goto done;
4307
+done:
4308
+ mutex_unlock(&session->s_mutex);
4309
+done_unlocked:
4310
+ ceph_put_string(extra_info.pool_ns);
4311
+ /* avoid calling iput_final() in mds dispatch threads */
4312
+ ceph_async_iput(inode);
4313
+ return;
39734314
39744315 flush_cap_releases:
39754316 /*
....@@ -3977,14 +4318,8 @@
39774318 * along for the mds (who clearly thinks we still have this
39784319 * cap).
39794320 */
3980
- ceph_send_cap_releases(mdsc, session);
3981
-
3982
-done:
3983
- mutex_unlock(&session->s_mutex);
3984
-done_unlocked:
3985
- iput(inode);
3986
- ceph_put_string(extra_info.pool_ns);
3987
- return;
4321
+ ceph_flush_cap_releases(mdsc, session);
4322
+ goto done;
39884323
39894324 bad:
39904325 pr_err("ceph_handle_caps: corrupt message\n");
....@@ -3994,56 +4329,70 @@
39944329
39954330 /*
39964331 * Delayed work handler to process end of delayed cap release LRU list.
4332
+ *
4333
+ * If new caps are added to the list while processing it, these won't get
4334
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
4335
+ * returned so that the work can be scheduled accordingly.
39974336 */
3998
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4337
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
39994338 {
40004339 struct inode *inode;
40014340 struct ceph_inode_info *ci;
4002
- int flags = CHECK_CAPS_NODELAY;
4341
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4342
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4343
+ unsigned long loop_start = jiffies;
4344
+ unsigned long delay = 0;
40034345
40044346 dout("check_delayed_caps\n");
4005
- while (1) {
4006
- spin_lock(&mdsc->cap_delay_lock);
4007
- if (list_empty(&mdsc->cap_delay_list))
4008
- break;
4347
+ spin_lock(&mdsc->cap_delay_lock);
4348
+ while (!list_empty(&mdsc->cap_delay_list)) {
40094349 ci = list_first_entry(&mdsc->cap_delay_list,
40104350 struct ceph_inode_info,
40114351 i_cap_delay_list);
4352
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4353
+ dout("%s caps added recently. Exiting loop", __func__);
4354
+ delay = ci->i_hold_caps_max;
4355
+ break;
4356
+ }
40124357 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
40134358 time_before(jiffies, ci->i_hold_caps_max))
40144359 break;
40154360 list_del_init(&ci->i_cap_delay_list);
40164361
40174362 inode = igrab(&ci->vfs_inode);
4018
- spin_unlock(&mdsc->cap_delay_lock);
4019
-
40204363 if (inode) {
4364
+ spin_unlock(&mdsc->cap_delay_lock);
40214365 dout("check_delayed_caps on %p\n", inode);
4022
- ceph_check_caps(ci, flags, NULL);
4023
- iput(inode);
4366
+ ceph_check_caps(ci, 0, NULL);
4367
+ /* avoid calling iput_final() in tick thread */
4368
+ ceph_async_iput(inode);
4369
+ spin_lock(&mdsc->cap_delay_lock);
40244370 }
40254371 }
40264372 spin_unlock(&mdsc->cap_delay_lock);
4373
+
4374
+ return delay;
40274375 }
40284376
40294377 /*
40304378 * Flush all dirty caps to the mds
40314379 */
4032
-void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4380
+static void flush_dirty_session_caps(struct ceph_mds_session *s)
40334381 {
4382
+ struct ceph_mds_client *mdsc = s->s_mdsc;
40344383 struct ceph_inode_info *ci;
40354384 struct inode *inode;
40364385
40374386 dout("flush_dirty_caps\n");
40384387 spin_lock(&mdsc->cap_dirty_lock);
4039
- while (!list_empty(&mdsc->cap_dirty)) {
4040
- ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
4388
+ while (!list_empty(&s->s_cap_dirty)) {
4389
+ ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
40414390 i_dirty_item);
40424391 inode = &ci->vfs_inode;
40434392 ihold(inode);
40444393 dout("flush_dirty_caps %p\n", inode);
40454394 spin_unlock(&mdsc->cap_dirty_lock);
4046
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
4395
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
40474396 iput(inode);
40484397 spin_lock(&mdsc->cap_dirty_lock);
40494398 }
....@@ -4051,14 +4400,53 @@
40514400 dout("flush_dirty_caps done\n");
40524401 }
40534402
4054
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
4403
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
40554404 {
4056
- int i;
4405
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
4406
+}
4407
+
4408
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
4409
+ struct ceph_mds_client *mdsc, int fmode)
4410
+{
4411
+ unsigned long now = jiffies;
4412
+ if (fmode & CEPH_FILE_MODE_RD)
4413
+ ci->i_last_rd = now;
4414
+ if (fmode & CEPH_FILE_MODE_WR)
4415
+ ci->i_last_wr = now;
4416
+ /* queue periodic check */
4417
+ if (fmode &&
4418
+ __ceph_is_any_real_caps(ci) &&
4419
+ list_empty(&ci->i_cap_delay_list))
4420
+ __cap_delay_requeue(mdsc, ci);
4421
+}
4422
+
4423
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4424
+{
4425
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
40574426 int bits = (fmode << 1) | 1;
4427
+ bool already_opened = false;
4428
+ int i;
4429
+
4430
+ if (count == 1)
4431
+ atomic64_inc(&mdsc->metric.opened_files);
4432
+
4433
+ spin_lock(&ci->i_ceph_lock);
40584434 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4435
+ /*
4436
+ * If any of the mode ref is larger than 0,
4437
+ * that means it has been already opened by
4438
+ * others. Just skip checking the PIN ref.
4439
+ */
4440
+ if (i && ci->i_nr_by_mode[i])
4441
+ already_opened = true;
4442
+
40594443 if (bits & (1 << i))
4060
- ci->i_nr_by_mode[i]++;
4444
+ ci->i_nr_by_mode[i] += count;
40614445 }
4446
+
4447
+ if (!already_opened)
4448
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
4449
+ spin_unlock(&ci->i_ceph_lock);
40624450 }
40634451
40644452 /*
....@@ -4066,30 +4454,39 @@
40664454 * we may need to release capabilities to the MDS (or schedule
40674455 * their delayed release).
40684456 */
4069
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
4457
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
40704458 {
4071
- int i, last = 0;
4459
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
40724460 int bits = (fmode << 1) | 1;
4461
+ bool is_closed = true;
4462
+ int i;
4463
+
4464
+ if (count == 1)
4465
+ atomic64_dec(&mdsc->metric.opened_files);
4466
+
40734467 spin_lock(&ci->i_ceph_lock);
40744468 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
40754469 if (bits & (1 << i)) {
4076
- BUG_ON(ci->i_nr_by_mode[i] == 0);
4077
- if (--ci->i_nr_by_mode[i] == 0)
4078
- last++;
4470
+ BUG_ON(ci->i_nr_by_mode[i] < count);
4471
+ ci->i_nr_by_mode[i] -= count;
40794472 }
4080
- }
4081
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
4082
- &ci->vfs_inode, fmode,
4083
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
4084
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
4085
- spin_unlock(&ci->i_ceph_lock);
40864473
4087
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
4088
- ceph_check_caps(ci, 0, NULL);
4474
+ /*
4475
+ * If any of the mode ref is not 0 after
4476
+ * decreased, that means it is still opened
4477
+ * by others. Just skip checking the PIN ref.
4478
+ */
4479
+ if (i && ci->i_nr_by_mode[i])
4480
+ is_closed = false;
4481
+ }
4482
+
4483
+ if (is_closed)
4484
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
4485
+ spin_unlock(&ci->i_ceph_lock);
40894486 }
40904487
40914488 /*
4092
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
4489
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
40934490 * looks like the link count will hit 0, drop any other caps (other
40944491 * than PIN) we don't specifically want (due to the file still being
40954492 * open).
....@@ -4103,7 +4500,6 @@
41034500 if (inode->i_nlink == 1) {
41044501 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
41054502
4106
- ci->i_ceph_flags |= CEPH_I_NODELAY;
41074503 if (__ceph_caps_dirty(ci)) {
41084504 struct ceph_mds_client *mdsc =
41094505 ceph_inode_to_client(inode)->mdsc;
....@@ -4159,8 +4555,6 @@
41594555 if (force || (cap->issued & drop)) {
41604556 if (cap->issued & drop) {
41614557 int wanted = __ceph_caps_wanted(ci);
4162
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
4163
- wanted |= cap->mds_wanted;
41644558 dout("encode_inode_release %p cap %p "
41654559 "%s -> %s, wanted %s -> %s\n", inode, cap,
41664560 ceph_cap_string(cap->issued),
....@@ -4171,6 +4565,9 @@
41714565 cap->issued &= ~drop;
41724566 cap->implemented &= ~drop;
41734567 cap->mds_wanted = wanted;
4568
+ if (cap == ci->i_auth_cap &&
4569
+ !(wanted & CEPH_CAP_ANY_FILE_WR))
4570
+ ci->i_requested_max_size = 0;
41744571 } else {
41754572 dout("encode_inode_release %p cap %p %s"
41764573 " (force)\n", inode, cap,