forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/fs/ceph/caps.c
....@@ -8,6 +8,7 @@
88 #include <linux/vmalloc.h>
99 #include <linux/wait.h>
1010 #include <linux/writeback.h>
11
+#include <linux/iversion.h>
1112
1213 #include "super.h"
1314 #include "mds_client.h"
....@@ -148,11 +149,17 @@
148149 spin_unlock(&mdsc->caps_list_lock);
149150 }
150151
151
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
152
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
153
+ struct ceph_mount_options *fsopt)
152154 {
153155 spin_lock(&mdsc->caps_list_lock);
154
- mdsc->caps_min_count += delta;
155
- BUG_ON(mdsc->caps_min_count < 0);
156
+ mdsc->caps_min_count = fsopt->max_readdir;
157
+ if (mdsc->caps_min_count < 1024)
158
+ mdsc->caps_min_count = 1024;
159
+ mdsc->caps_use_max = fsopt->caps_max;
160
+ if (mdsc->caps_use_max > 0 &&
161
+ mdsc->caps_use_max < mdsc->caps_min_count)
162
+ mdsc->caps_use_max = mdsc->caps_min_count;
156163 spin_unlock(&mdsc->caps_list_lock);
157164 }
158165
....@@ -272,6 +279,7 @@
272279 if (!err) {
273280 BUG_ON(have + alloc != need);
274281 ctx->count = need;
282
+ ctx->used = 0;
275283 }
276284
277285 spin_lock(&mdsc->caps_list_lock);
....@@ -295,13 +303,24 @@
295303 }
296304
297305 void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
298
- struct ceph_cap_reservation *ctx)
306
+ struct ceph_cap_reservation *ctx)
299307 {
308
+ bool reclaim = false;
309
+ if (!ctx->count)
310
+ return;
311
+
300312 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
301313 spin_lock(&mdsc->caps_list_lock);
302314 __ceph_unreserve_caps(mdsc, ctx->count);
303315 ctx->count = 0;
316
+
317
+ if (mdsc->caps_use_max > 0 &&
318
+ mdsc->caps_use_count > mdsc->caps_use_max)
319
+ reclaim = true;
304320 spin_unlock(&mdsc->caps_list_lock);
321
+
322
+ if (reclaim)
323
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
305324 }
306325
307326 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
....@@ -346,6 +365,7 @@
346365 BUG_ON(list_empty(&mdsc->caps_list));
347366
348367 ctx->count--;
368
+ ctx->used++;
349369 mdsc->caps_reserve_count--;
350370 mdsc->caps_use_count++;
351371
....@@ -438,37 +458,6 @@
438458 }
439459
440460 /*
441
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
442
- */
443
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
444
-{
445
- struct ceph_cap *cap;
446
- int mds = -1;
447
- struct rb_node *p;
448
-
449
- /* prefer mds with WR|BUFFER|EXCL caps */
450
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
451
- cap = rb_entry(p, struct ceph_cap, ci_node);
452
- mds = cap->mds;
453
- if (cap->issued & (CEPH_CAP_FILE_WR |
454
- CEPH_CAP_FILE_BUFFER |
455
- CEPH_CAP_FILE_EXCL))
456
- break;
457
- }
458
- return mds;
459
-}
460
-
461
-int ceph_get_cap_mds(struct inode *inode)
462
-{
463
- struct ceph_inode_info *ci = ceph_inode(inode);
464
- int mds;
465
- spin_lock(&ci->i_ceph_lock);
466
- mds = __ceph_get_cap_mds(ceph_inode(inode));
467
- spin_unlock(&ci->i_ceph_lock);
468
- return mds;
469
-}
470
-
471
-/*
472461 * Called under i_ceph_lock.
473462 */
474463 static void __insert_cap_node(struct ceph_inode_info *ci,
....@@ -500,14 +489,11 @@
500489 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
501490 struct ceph_inode_info *ci)
502491 {
503
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
504
-
505
- ci->i_hold_caps_min = round_jiffies(jiffies +
506
- ma->caps_wanted_delay_min * HZ);
492
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
507493 ci->i_hold_caps_max = round_jiffies(jiffies +
508
- ma->caps_wanted_delay_max * HZ);
509
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
510
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
494
+ opt->caps_wanted_delay_max * HZ);
495
+ dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
496
+ ci->i_hold_caps_max - jiffies);
511497 }
512498
513499 /*
....@@ -521,8 +507,7 @@
521507 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
522508 struct ceph_inode_info *ci)
523509 {
524
- __cap_set_timeouts(mdsc, ci);
525
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
510
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
526511 ci->i_ceph_flags, ci->i_hold_caps_max);
527512 if (!mdsc->stopping) {
528513 spin_lock(&mdsc->cap_delay_lock);
....@@ -531,6 +516,7 @@
531516 goto no_change;
532517 list_del_init(&ci->i_cap_delay_list);
533518 }
519
+ __cap_set_timeouts(mdsc, ci);
534520 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
535521 no_change:
536522 spin_unlock(&mdsc->cap_delay_lock);
....@@ -570,19 +556,20 @@
570556 spin_unlock(&mdsc->cap_delay_lock);
571557 }
572558
573
-/*
574
- * Common issue checks for add_cap, handle_cap_grant.
575
- */
559
+/* Common issue checks for add_cap, handle_cap_grant. */
576560 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
577561 unsigned issued)
578562 {
579563 unsigned had = __ceph_caps_issued(ci, NULL);
580564
565
+ lockdep_assert_held(&ci->i_ceph_lock);
566
+
581567 /*
582568 * Each time we receive FILE_CACHE anew, we increment
583569 * i_rdcache_gen.
584570 */
585
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
571
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
572
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
586573 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
587574 ci->i_rdcache_gen++;
588575 }
....@@ -601,12 +588,40 @@
601588 __ceph_dir_clear_complete(ci);
602589 }
603590 }
591
+
592
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
593
+ if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
594
+ !(issued & CEPH_CAP_DIR_CREATE)) {
595
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
596
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
597
+ }
598
+}
599
+
600
+/**
601
+ * change_auth_cap_ses - move inode to appropriate lists when auth caps change
602
+ * @ci: inode to be moved
603
+ * @session: new auth caps session
604
+ */
605
+static void change_auth_cap_ses(struct ceph_inode_info *ci,
606
+ struct ceph_mds_session *session)
607
+{
608
+ lockdep_assert_held(&ci->i_ceph_lock);
609
+
610
+ if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
611
+ return;
612
+
613
+ spin_lock(&session->s_mdsc->cap_dirty_lock);
614
+ if (!list_empty(&ci->i_dirty_item))
615
+ list_move(&ci->i_dirty_item, &session->s_cap_dirty);
616
+ if (!list_empty(&ci->i_flushing_item))
617
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
618
+ spin_unlock(&session->s_mdsc->cap_dirty_lock);
604619 }
605620
606621 /*
607622 * Add a capability under the given MDS session.
608623 *
609
- * Caller should hold session snap_rwsem (read) and s_mutex.
624
+ * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
610625 *
611626 * @fmode is the open file mode, if we are opening a file, otherwise
612627 * it is < 0. (This is so we can atomically add the cap and add an
....@@ -614,7 +629,7 @@
614629 */
615630 void ceph_add_cap(struct inode *inode,
616631 struct ceph_mds_session *session, u64 cap_id,
617
- int fmode, unsigned issued, unsigned wanted,
632
+ unsigned issued, unsigned wanted,
618633 unsigned seq, unsigned mseq, u64 realmino, int flags,
619634 struct ceph_cap **new_cap)
620635 {
....@@ -623,16 +638,16 @@
623638 struct ceph_cap *cap;
624639 int mds = session->s_mds;
625640 int actual_wanted;
641
+ u32 gen;
642
+
643
+ lockdep_assert_held(&ci->i_ceph_lock);
626644
627645 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
628646 session->s_mds, cap_id, ceph_cap_string(issued), seq);
629647
630
- /*
631
- * If we are opening the file, include file mode wanted bits
632
- * in wanted.
633
- */
634
- if (fmode >= 0)
635
- wanted |= ceph_caps_for_mode(fmode);
648
+ spin_lock(&session->s_gen_ttl_lock);
649
+ gen = session->s_cap_gen;
650
+ spin_unlock(&session->s_gen_ttl_lock);
636651
637652 cap = __get_cap_for_mds(ci, mds);
638653 if (!cap) {
....@@ -653,8 +668,16 @@
653668 spin_lock(&session->s_cap_lock);
654669 list_add_tail(&cap->session_caps, &session->s_caps);
655670 session->s_nr_caps++;
671
+ atomic64_inc(&mdsc->metric.total_caps);
656672 spin_unlock(&session->s_cap_lock);
657673 } else {
674
+ spin_lock(&session->s_cap_lock);
675
+ list_move_tail(&cap->session_caps, &session->s_caps);
676
+ spin_unlock(&session->s_cap_lock);
677
+
678
+ if (cap->cap_gen < gen)
679
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
680
+
658681 /*
659682 * auth mds of the inode changed. we received the cap export
660683 * message, but still haven't received the cap import message.
....@@ -726,6 +749,9 @@
726749 if (flags & CEPH_CAP_FLAG_AUTH) {
727750 if (!ci->i_auth_cap ||
728751 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
752
+ if (ci->i_auth_cap &&
753
+ ci->i_auth_cap->session != cap->session)
754
+ change_auth_cap_ses(ci, cap->session);
729755 ci->i_auth_cap = cap;
730756 cap->mds_wanted = wanted;
731757 }
....@@ -746,10 +772,7 @@
746772 cap->seq = seq;
747773 cap->issue_seq = seq;
748774 cap->mseq = mseq;
749
- cap->cap_gen = session->s_cap_gen;
750
-
751
- if (fmode >= 0)
752
- __ceph_get_fmode(ci, fmode);
775
+ cap->cap_gen = gen;
753776 }
754777
755778 /*
....@@ -864,8 +887,8 @@
864887 int have = ci->i_snap_caps;
865888
866889 if ((have & mask) == mask) {
867
- dout("__ceph_caps_issued_mask %p snap issued %s"
868
- " (mask %s)\n", &ci->vfs_inode,
890
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
891
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
869892 ceph_cap_string(have),
870893 ceph_cap_string(mask));
871894 return 1;
....@@ -876,8 +899,8 @@
876899 if (!__cap_is_valid(cap))
877900 continue;
878901 if ((cap->issued & mask) == mask) {
879
- dout("__ceph_caps_issued_mask %p cap %p issued %s"
880
- " (mask %s)\n", &ci->vfs_inode, cap,
902
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
903
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
881904 ceph_cap_string(cap->issued),
882905 ceph_cap_string(mask));
883906 if (touch)
....@@ -888,8 +911,8 @@
888911 /* does a combination of caps satisfy mask? */
889912 have |= cap->issued;
890913 if ((have & mask) == mask) {
891
- dout("__ceph_caps_issued_mask %p combo issued %s"
892
- " (mask %s)\n", &ci->vfs_inode,
914
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
915
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
893916 ceph_cap_string(cap->issued),
894917 ceph_cap_string(mask));
895918 if (touch) {
....@@ -903,7 +926,8 @@
903926 ci_node);
904927 if (!__cap_is_valid(cap))
905928 continue;
906
- __touch_cap(cap);
929
+ if (cap->issued & mask)
930
+ __touch_cap(cap);
907931 }
908932 }
909933 return 1;
....@@ -911,6 +935,20 @@
911935 }
912936
913937 return 0;
938
+}
939
+
940
+int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
941
+ int touch)
942
+{
943
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
944
+ int r;
945
+
946
+ r = __ceph_caps_issued_mask(ci, mask, touch);
947
+ if (r)
948
+ ceph_update_cap_hit(&fsc->mdsc->metric);
949
+ else
950
+ ceph_update_cap_mis(&fsc->mdsc->metric);
951
+ return r;
914952 }
915953
916954 /*
....@@ -952,29 +990,97 @@
952990 if (ci->i_rd_ref)
953991 used |= CEPH_CAP_FILE_RD;
954992 if (ci->i_rdcache_ref ||
955
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
993
+ (S_ISREG(ci->vfs_inode.i_mode) &&
956994 ci->vfs_inode.i_data.nrpages))
957995 used |= CEPH_CAP_FILE_CACHE;
958996 if (ci->i_wr_ref)
959997 used |= CEPH_CAP_FILE_WR;
960998 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
961999 used |= CEPH_CAP_FILE_BUFFER;
1000
+ if (ci->i_fx_ref)
1001
+ used |= CEPH_CAP_FILE_EXCL;
9621002 return used;
9631003 }
1004
+
1005
+#define FMODE_WAIT_BIAS 1000
9641006
9651007 /*
9661008 * wanted, by virtue of open file modes
9671009 */
9681010 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
9691011 {
970
- int i, bits = 0;
971
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
972
- if (ci->i_nr_by_mode[i])
973
- bits |= 1 << i;
1012
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
1013
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
1014
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
1015
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
1016
+ struct ceph_mount_options *opt =
1017
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
1018
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1019
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1020
+
1021
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
1022
+ int want = 0;
1023
+
1024
+ /* use used_cutoff here, to keep dir's wanted caps longer */
1025
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1026
+ time_after(ci->i_last_rd, used_cutoff))
1027
+ want |= CEPH_CAP_ANY_SHARED;
1028
+
1029
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1030
+ time_after(ci->i_last_wr, used_cutoff)) {
1031
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1032
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1033
+ want |= CEPH_CAP_ANY_DIR_OPS;
1034
+ }
1035
+
1036
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1037
+ want |= CEPH_CAP_PIN;
1038
+
1039
+ return want;
1040
+ } else {
1041
+ int bits = 0;
1042
+
1043
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1044
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1045
+ time_after(ci->i_last_rd, used_cutoff))
1046
+ bits |= 1 << RD_SHIFT;
1047
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
1048
+ bits |= 1 << RD_SHIFT;
1049
+ }
1050
+
1051
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1052
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1053
+ time_after(ci->i_last_wr, used_cutoff))
1054
+ bits |= 1 << WR_SHIFT;
1055
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
1056
+ bits |= 1 << WR_SHIFT;
1057
+ }
1058
+
1059
+ /* check lazyio only when read/write is wanted */
1060
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1061
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1062
+ bits |= 1 << LAZY_SHIFT;
1063
+
1064
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
9741065 }
975
- if (bits == 0)
976
- return 0;
977
- return ceph_caps_for_mode(bits >> 1);
1066
+}
1067
+
1068
+/*
1069
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1070
+ */
1071
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
1072
+{
1073
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
1074
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
1075
+ /* we want EXCL if holding caps of dir ops */
1076
+ if (w & CEPH_CAP_ANY_DIR_OPS)
1077
+ w |= CEPH_CAP_FILE_EXCL;
1078
+ } else {
1079
+ /* we want EXCL if dirty data */
1080
+ if (w & CEPH_CAP_FILE_BUFFER)
1081
+ w |= CEPH_CAP_FILE_EXCL;
1082
+ }
1083
+ return w;
9781084 }
9791085
9801086 /*
....@@ -998,26 +1104,13 @@
9981104 return mds_wanted;
9991105 }
10001106
1001
-/*
1002
- * called under i_ceph_lock
1003
- */
1004
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
1005
-{
1006
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
1007
-}
1008
-
1009
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
1010
-{
1011
- return !RB_EMPTY_ROOT(&ci->i_caps);
1012
-}
1013
-
10141107 int ceph_is_any_caps(struct inode *inode)
10151108 {
10161109 struct ceph_inode_info *ci = ceph_inode(inode);
10171110 int ret;
10181111
10191112 spin_lock(&ci->i_ceph_lock);
1020
- ret = __ceph_is_any_caps(ci);
1113
+ ret = __ceph_is_any_real_caps(ci);
10211114 spin_unlock(&ci->i_ceph_lock);
10221115
10231116 return ret;
....@@ -1062,8 +1155,10 @@
10621155
10631156 /* remove from inode's cap rbtree, and clear auth cap */
10641157 rb_erase(&cap->ci_node, &ci->i_caps);
1065
- if (ci->i_auth_cap == cap)
1158
+ if (ci->i_auth_cap == cap) {
1159
+ WARN_ON_ONCE(!list_empty(&ci->i_dirty_item));
10661160 ci->i_auth_cap = NULL;
1161
+ }
10671162
10681163 /* remove from session list */
10691164 spin_lock(&session->s_cap_lock);
....@@ -1074,6 +1169,7 @@
10741169 } else {
10751170 list_del_init(&cap->session_caps);
10761171 session->s_nr_caps--;
1172
+ atomic64_dec(&mdsc->metric.total_caps);
10771173 cap->session = NULL;
10781174 removed = 1;
10791175 }
....@@ -1088,9 +1184,7 @@
10881184 (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
10891185 cap->queue_release = 1;
10901186 if (removed) {
1091
- list_add_tail(&cap->session_caps,
1092
- &session->s_cap_releases);
1093
- session->s_num_cap_releases++;
1187
+ __ceph_queue_cap_release(session, cap);
10941188 removed = 0;
10951189 }
10961190 } else {
....@@ -1103,15 +1197,16 @@
11031197 if (removed)
11041198 ceph_put_cap(mdsc, cap);
11051199
1106
- /* when reconnect denied, we remove session caps forcibly,
1107
- * i_wr_ref can be non-zero. If there are ongoing write,
1108
- * keep i_snap_realm.
1109
- */
1110
- if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
1111
- drop_inode_snap_realm(ci);
1200
+ if (!__ceph_is_any_real_caps(ci)) {
1201
+ /* when reconnect denied, we remove session caps forcibly,
1202
+ * i_wr_ref can be non-zero. If there are ongoing write,
1203
+ * keep i_snap_realm.
1204
+ */
1205
+ if (ci->i_wr_ref == 0 && ci->i_snap_realm)
1206
+ drop_inode_snap_realm(ci);
11121207
1113
- if (!__ceph_is_any_real_caps(ci))
11141208 __cap_delay_cancel(mdsc, ci);
1209
+ }
11151210 }
11161211
11171212 struct cap_msg_args {
....@@ -1119,8 +1214,10 @@
11191214 u64 ino, cid, follows;
11201215 u64 flush_tid, oldest_flush_tid, size, max_size;
11211216 u64 xattr_version;
1217
+ u64 change_attr;
11221218 struct ceph_buffer *xattr_buf;
1123
- struct timespec64 atime, mtime, ctime;
1219
+ struct ceph_buffer *old_xattr_buf;
1220
+ struct timespec64 atime, mtime, ctime, btime;
11241221 int op, caps, wanted, dirty;
11251222 u32 seq, issue_seq, mseq, time_warp_seq;
11261223 u32 flags;
....@@ -1128,39 +1225,30 @@
11281225 kgid_t gid;
11291226 umode_t mode;
11301227 bool inline_data;
1228
+ bool wake;
11311229 };
11321230
11331231 /*
1134
- * Build and send a cap message to the given MDS.
1135
- *
1136
- * Caller should be holding s_mutex.
1232
+ * cap struct size + flock buffer size + inline version + inline data size +
1233
+ * osd_epoch_barrier + oldest_flush_tid
11371234 */
1138
-static int send_cap_msg(struct cap_msg_args *arg)
1235
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
1236
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
1237
+
1238
+/* Marshal up the cap msg to the MDS */
1239
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
11391240 {
11401241 struct ceph_mds_caps *fc;
1141
- struct ceph_msg *msg;
11421242 void *p;
1143
- size_t extra_len;
1144
- struct timespec64 zerotime = {0};
11451243 struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
11461244
1147
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
1148
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
1149
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
1150
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
1151
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
1152
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
1153
- arg->mseq, arg->follows, arg->size, arg->max_size,
1154
- arg->xattr_version,
1245
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
1246
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1247
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1248
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1249
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1250
+ arg->size, arg->max_size, arg->xattr_version,
11551251 arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
1156
-
1157
- /* flock buffer size + inline version + inline data size +
1158
- * osd_epoch_barrier + oldest_flush_tid */
1159
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
1160
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
1161
- GFP_NOFS, false);
1162
- if (!msg)
1163
- return -ENOMEM;
11641252
11651253 msg->hdr.version = cpu_to_le16(10);
11661254 msg->hdr.tid = cpu_to_le64(arg->flush_tid);
....@@ -1226,29 +1314,20 @@
12261314 /* pool namespace (version 8) (mds always ignores this) */
12271315 ceph_encode_32(&p, 0);
12281316
1229
- /*
1230
- * btime and change_attr (version 9)
1231
- *
1232
- * We just zero these out for now, as the MDS ignores them unless
1233
- * the requisite feature flags are set (which we don't do yet).
1234
- */
1235
- ceph_encode_timespec64(p, &zerotime);
1317
+ /* btime and change_attr (version 9) */
1318
+ ceph_encode_timespec64(p, &arg->btime);
12361319 p += sizeof(struct ceph_timespec);
1237
- ceph_encode_64(&p, 0);
1320
+ ceph_encode_64(&p, arg->change_attr);
12381321
12391322 /* Advisory flags (version 10) */
12401323 ceph_encode_32(&p, arg->flags);
1241
-
1242
- ceph_con_send(&arg->session->s_con, msg);
1243
- return 0;
12441324 }
12451325
12461326 /*
12471327 * Queue cap releases when an inode is dropped from our cache.
12481328 */
1249
-void ceph_queue_caps_release(struct inode *inode)
1329
+void __ceph_remove_caps(struct ceph_inode_info *ci)
12501330 {
1251
- struct ceph_inode_info *ci = ceph_inode(inode);
12521331 struct rb_node *p;
12531332
12541333 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
....@@ -1264,141 +1343,133 @@
12641343 }
12651344
12661345 /*
1267
- * Send a cap msg on the given inode. Update our caps state, then
1268
- * drop i_ceph_lock and send the message.
1346
+ * Prepare to send a cap message to an MDS. Update the cap state, and populate
1347
+ * the arg struct with the parameters that will need to be sent. This should
1348
+ * be done under the i_ceph_lock to guard against changes to cap state.
12691349 *
12701350 * Make note of max_size reported/requested from mds, revoked caps
12711351 * that have now been implemented.
1272
- *
1273
- * Make half-hearted attempt ot to invalidate page cache if we are
1274
- * dropping RDCACHE. Note that this will leave behind locked pages
1275
- * that we'll then need to deal with elsewhere.
1276
- *
1277
- * Return non-zero if delayed release, or we experienced an error
1278
- * such that the caller should requeue + retry later.
1279
- *
1280
- * called with i_ceph_lock, then drops it.
1281
- * caller should hold snap_rwsem (read), s_mutex.
12821352 */
1283
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1284
- int op, bool sync, int used, int want, int retain,
1285
- int flushing, u64 flush_tid, u64 oldest_flush_tid)
1286
- __releases(cap->ci->i_ceph_lock)
1353
+static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
1354
+ int op, int flags, int used, int want, int retain,
1355
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
12871356 {
12881357 struct ceph_inode_info *ci = cap->ci;
12891358 struct inode *inode = &ci->vfs_inode;
1290
- struct ceph_buffer *old_blob = NULL;
1291
- struct cap_msg_args arg;
12921359 int held, revoking;
1293
- int wake = 0;
1294
- int delayed = 0;
1295
- int ret;
1360
+
1361
+ lockdep_assert_held(&ci->i_ceph_lock);
12961362
12971363 held = cap->issued | cap->implemented;
12981364 revoking = cap->implemented & ~cap->issued;
12991365 retain &= ~revoking;
13001366
1301
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1302
- inode, cap, cap->session,
1367
+ dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
1368
+ __func__, inode, cap, cap->session,
13031369 ceph_cap_string(held), ceph_cap_string(held & retain),
13041370 ceph_cap_string(revoking));
13051371 BUG_ON((retain & CEPH_CAP_PIN) == 0);
13061372
1307
- arg.session = cap->session;
1308
-
1309
- /* don't release wanted unless we've waited a bit. */
1310
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1311
- time_before(jiffies, ci->i_hold_caps_min)) {
1312
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1313
- ceph_cap_string(cap->issued),
1314
- ceph_cap_string(cap->issued & retain),
1315
- ceph_cap_string(cap->mds_wanted),
1316
- ceph_cap_string(want));
1317
- want |= cap->mds_wanted;
1318
- retain |= cap->issued;
1319
- delayed = 1;
1320
- }
1321
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1322
- if (want & ~cap->mds_wanted) {
1323
- /* user space may open/close single file frequently.
1324
- * This avoids droping mds_wanted immediately after
1325
- * requesting new mds_wanted.
1326
- */
1327
- __cap_set_timeouts(mdsc, ci);
1328
- }
1373
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
13291374
13301375 cap->issued &= retain; /* drop bits we don't want */
1331
- if (cap->implemented & ~cap->issued) {
1332
- /*
1333
- * Wake up any waiters on wanted -> needed transition.
1334
- * This is due to the weird transition from buffered
1335
- * to sync IO... we need to flush dirty pages _before_
1336
- * allowing sync writes to avoid reordering.
1337
- */
1338
- wake = 1;
1339
- }
1376
+ /*
1377
+ * Wake up any waiters on wanted -> needed transition. This is due to
1378
+ * the weird transition from buffered to sync IO... we need to flush
1379
+ * dirty pages _before_ allowing sync writes to avoid reordering.
1380
+ */
1381
+ arg->wake = cap->implemented & ~cap->issued;
13401382 cap->implemented &= cap->issued | used;
13411383 cap->mds_wanted = want;
13421384
1343
- arg.ino = ceph_vino(inode).ino;
1344
- arg.cid = cap->cap_id;
1345
- arg.follows = flushing ? ci->i_head_snapc->seq : 0;
1346
- arg.flush_tid = flush_tid;
1347
- arg.oldest_flush_tid = oldest_flush_tid;
1385
+ arg->session = cap->session;
1386
+ arg->ino = ceph_vino(inode).ino;
1387
+ arg->cid = cap->cap_id;
1388
+ arg->follows = flushing ? ci->i_head_snapc->seq : 0;
1389
+ arg->flush_tid = flush_tid;
1390
+ arg->oldest_flush_tid = oldest_flush_tid;
13481391
1349
- arg.size = inode->i_size;
1350
- ci->i_reported_size = arg.size;
1351
- arg.max_size = ci->i_wanted_max_size;
1352
- ci->i_requested_max_size = arg.max_size;
1392
+ arg->size = inode->i_size;
1393
+ ci->i_reported_size = arg->size;
1394
+ arg->max_size = ci->i_wanted_max_size;
1395
+ if (cap == ci->i_auth_cap) {
1396
+ if (want & CEPH_CAP_ANY_FILE_WR)
1397
+ ci->i_requested_max_size = arg->max_size;
1398
+ else
1399
+ ci->i_requested_max_size = 0;
1400
+ }
13531401
13541402 if (flushing & CEPH_CAP_XATTR_EXCL) {
1355
- old_blob = __ceph_build_xattrs_blob(ci);
1356
- arg.xattr_version = ci->i_xattrs.version;
1357
- arg.xattr_buf = ci->i_xattrs.blob;
1403
+ arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1404
+ arg->xattr_version = ci->i_xattrs.version;
1405
+ arg->xattr_buf = ci->i_xattrs.blob;
13581406 } else {
1359
- arg.xattr_buf = NULL;
1407
+ arg->xattr_buf = NULL;
1408
+ arg->old_xattr_buf = NULL;
13601409 }
13611410
1362
- arg.mtime = inode->i_mtime;
1363
- arg.atime = inode->i_atime;
1364
- arg.ctime = inode->i_ctime;
1411
+ arg->mtime = inode->i_mtime;
1412
+ arg->atime = inode->i_atime;
1413
+ arg->ctime = inode->i_ctime;
1414
+ arg->btime = ci->i_btime;
1415
+ arg->change_attr = inode_peek_iversion_raw(inode);
13651416
1366
- arg.op = op;
1367
- arg.caps = cap->implemented;
1368
- arg.wanted = want;
1369
- arg.dirty = flushing;
1417
+ arg->op = op;
1418
+ arg->caps = cap->implemented;
1419
+ arg->wanted = want;
1420
+ arg->dirty = flushing;
13701421
1371
- arg.seq = cap->seq;
1372
- arg.issue_seq = cap->issue_seq;
1373
- arg.mseq = cap->mseq;
1374
- arg.time_warp_seq = ci->i_time_warp_seq;
1422
+ arg->seq = cap->seq;
1423
+ arg->issue_seq = cap->issue_seq;
1424
+ arg->mseq = cap->mseq;
1425
+ arg->time_warp_seq = ci->i_time_warp_seq;
13751426
1376
- arg.uid = inode->i_uid;
1377
- arg.gid = inode->i_gid;
1378
- arg.mode = inode->i_mode;
1427
+ arg->uid = inode->i_uid;
1428
+ arg->gid = inode->i_gid;
1429
+ arg->mode = inode->i_mode;
13791430
1380
- arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1381
- if (list_empty(&ci->i_cap_snaps))
1382
- arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
1383
- else
1384
- arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1385
- if (sync)
1386
- arg.flags |= CEPH_CLIENT_CAPS_SYNC;
1431
+ arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1432
+ if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1433
+ !list_empty(&ci->i_cap_snaps)) {
1434
+ struct ceph_cap_snap *capsnap;
1435
+ list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1436
+ if (capsnap->cap_flush.tid)
1437
+ break;
1438
+ if (capsnap->need_flush) {
1439
+ flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1440
+ break;
1441
+ }
1442
+ }
1443
+ }
1444
+ arg->flags = flags;
1445
+}
13871446
1388
- spin_unlock(&ci->i_ceph_lock);
1447
+/*
1448
+ * Send a cap msg on the given inode.
1449
+ *
1450
+ * Caller should hold snap_rwsem (read), s_mutex.
1451
+ */
1452
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
1453
+{
1454
+ struct ceph_msg *msg;
1455
+ struct inode *inode = &ci->vfs_inode;
13891456
1390
- ceph_buffer_put(old_blob);
1391
-
1392
- ret = send_cap_msg(&arg);
1393
- if (ret < 0) {
1394
- dout("error sending cap msg, must requeue %p\n", inode);
1395
- delayed = 1;
1457
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1458
+ if (!msg) {
1459
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
1460
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
1461
+ arg->flush_tid);
1462
+ spin_lock(&ci->i_ceph_lock);
1463
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
1464
+ spin_unlock(&ci->i_ceph_lock);
1465
+ return;
13961466 }
13971467
1398
- if (wake)
1468
+ encode_cap_msg(msg, arg);
1469
+ ceph_con_send(&arg->session->s_con, msg);
1470
+ ceph_buffer_put(arg->old_xattr_buf);
1471
+ if (arg->wake)
13991472 wake_up_all(&ci->i_cap_wq);
1400
-
1401
- return delayed;
14021473 }
14031474
14041475 static inline int __send_flush_snap(struct inode *inode,
....@@ -1407,6 +1478,11 @@
14071478 u32 mseq, u64 oldest_flush_tid)
14081479 {
14091480 struct cap_msg_args arg;
1481
+ struct ceph_msg *msg;
1482
+
1483
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
1484
+ if (!msg)
1485
+ return -ENOMEM;
14101486
14111487 arg.session = session;
14121488 arg.ino = ceph_vino(inode).ino;
....@@ -1419,10 +1495,13 @@
14191495 arg.max_size = 0;
14201496 arg.xattr_version = capsnap->xattr_version;
14211497 arg.xattr_buf = capsnap->xattr_blob;
1498
+ arg.old_xattr_buf = NULL;
14221499
14231500 arg.atime = capsnap->atime;
14241501 arg.mtime = capsnap->mtime;
14251502 arg.ctime = capsnap->ctime;
1503
+ arg.btime = capsnap->btime;
1504
+ arg.change_attr = capsnap->change_attr;
14261505
14271506 arg.op = CEPH_CAP_OP_FLUSHSNAP;
14281507 arg.caps = capsnap->issued;
....@@ -1440,8 +1519,11 @@
14401519
14411520 arg.inline_data = capsnap->inline_data;
14421521 arg.flags = 0;
1522
+ arg.wake = false;
14431523
1444
- return send_cap_msg(&arg);
1524
+ encode_cap_msg(msg, &arg);
1525
+ ceph_con_send(&arg.session->s_con, msg);
1526
+ return 0;
14451527 }
14461528
14471529 /*
....@@ -1590,10 +1672,8 @@
15901672 }
15911673
15921674 // make sure flushsnap messages are sent in proper order.
1593
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
1675
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
15941676 __kick_flushing_caps(mdsc, session, ci, 0);
1595
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
1596
- }
15971677
15981678 __ceph_flush_snaps(ci, session);
15991679 out:
....@@ -1625,6 +1705,8 @@
16251705 int was = ci->i_dirty_caps;
16261706 int dirty = 0;
16271707
1708
+ lockdep_assert_held(&ci->i_ceph_lock);
1709
+
16281710 if (!ci->i_auth_cap) {
16291711 pr_warn("__mark_dirty_caps %p %llx mask %s, "
16301712 "but no auth cap (session was closed?)\n",
....@@ -1637,6 +1719,8 @@
16371719 ceph_cap_string(was | mask));
16381720 ci->i_dirty_caps |= mask;
16391721 if (was == 0) {
1722
+ struct ceph_mds_session *session = ci->i_auth_cap->session;
1723
+
16401724 WARN_ON_ONCE(ci->i_prealloc_cap_flush);
16411725 swap(ci->i_prealloc_cap_flush, *pcf);
16421726
....@@ -1649,7 +1733,7 @@
16491733 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
16501734 BUG_ON(!list_empty(&ci->i_dirty_item));
16511735 spin_lock(&mdsc->cap_dirty_lock);
1652
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1736
+ list_add(&ci->i_dirty_item, &session->s_cap_dirty);
16531737 spin_unlock(&mdsc->cap_dirty_lock);
16541738 if (ci->i_flushing_caps == 0) {
16551739 ihold(inode);
....@@ -1668,7 +1752,14 @@
16681752
16691753 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
16701754 {
1671
- return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1755
+ struct ceph_cap_flush *cf;
1756
+
1757
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1758
+ if (!cf)
1759
+ return NULL;
1760
+
1761
+ cf->is_capsnap = false;
1762
+ return cf;
16721763 }
16731764
16741765 void ceph_free_cap_flush(struct ceph_cap_flush *cf)
....@@ -1692,30 +1783,33 @@
16921783 * Remove cap_flush from the mdsc's or inode's flushing cap list.
16931784 * Return true if caller needs to wake up flush waiters.
16941785 */
1695
-static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
1696
- struct ceph_inode_info *ci,
1697
- struct ceph_cap_flush *cf)
1786
+static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1787
+ struct ceph_cap_flush *cf)
16981788 {
16991789 struct ceph_cap_flush *prev;
17001790 bool wake = cf->wake;
1701
- if (mdsc) {
1702
- /* are there older pending cap flushes? */
1703
- if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1704
- prev = list_prev_entry(cf, g_list);
1705
- prev->wake = true;
1706
- wake = false;
1707
- }
1708
- list_del(&cf->g_list);
1709
- } else if (ci) {
1710
- if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1711
- prev = list_prev_entry(cf, i_list);
1712
- prev->wake = true;
1713
- wake = false;
1714
- }
1715
- list_del(&cf->i_list);
1716
- } else {
1717
- BUG_ON(1);
1791
+
1792
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1793
+ prev = list_prev_entry(cf, g_list);
1794
+ prev->wake = true;
1795
+ wake = false;
17181796 }
1797
+ list_del_init(&cf->g_list);
1798
+ return wake;
1799
+}
1800
+
1801
+static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1802
+ struct ceph_cap_flush *cf)
1803
+{
1804
+ struct ceph_cap_flush *prev;
1805
+ bool wake = cf->wake;
1806
+
1807
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1808
+ prev = list_prev_entry(cf, i_list);
1809
+ prev->wake = true;
1810
+ wake = false;
1811
+ }
1812
+ list_del_init(&cf->i_list);
17191813 return wake;
17201814 }
17211815
....@@ -1723,17 +1817,18 @@
17231817 * Add dirty inode to the flushing list. Assigned a seq number so we
17241818 * can wait for caps to flush without starving.
17251819 *
1726
- * Called under i_ceph_lock.
1820
+ * Called under i_ceph_lock. Returns the flush tid.
17271821 */
1728
-static int __mark_caps_flushing(struct inode *inode,
1822
+static u64 __mark_caps_flushing(struct inode *inode,
17291823 struct ceph_mds_session *session, bool wake,
1730
- u64 *flush_tid, u64 *oldest_flush_tid)
1824
+ u64 *oldest_flush_tid)
17311825 {
17321826 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
17331827 struct ceph_inode_info *ci = ceph_inode(inode);
17341828 struct ceph_cap_flush *cf = NULL;
17351829 int flushing;
17361830
1831
+ lockdep_assert_held(&ci->i_ceph_lock);
17371832 BUG_ON(ci->i_dirty_caps == 0);
17381833 BUG_ON(list_empty(&ci->i_dirty_item));
17391834 BUG_ON(!ci->i_prealloc_cap_flush);
....@@ -1766,8 +1861,7 @@
17661861
17671862 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
17681863
1769
- *flush_tid = cf->tid;
1770
- return flushing;
1864
+ return cf->tid;
17711865 }
17721866
17731867 /*
....@@ -1817,8 +1911,6 @@
18171911 * versus held caps. Release, flush, ack revoked caps to mds as
18181912 * appropriate.
18191913 *
1820
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1821
- * cap release further.
18221914 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
18231915 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
18241916 * further delay.
....@@ -1826,9 +1918,8 @@
18261918 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
18271919 struct ceph_mds_session *session)
18281920 {
1829
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1830
- struct ceph_mds_client *mdsc = fsc->mdsc;
18311921 struct inode *inode = &ci->vfs_inode;
1922
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
18321923 struct ceph_cap *cap;
18331924 u64 flush_tid, oldest_flush_tid;
18341925 int file_wanted, used, cap_used;
....@@ -1837,48 +1928,53 @@
18371928 int mds = -1; /* keep track of how far we've gone through i_caps list
18381929 to avoid an infinite loop on retry */
18391930 struct rb_node *p;
1840
- int delayed = 0, sent = 0;
1841
- bool no_delay = flags & CHECK_CAPS_NODELAY;
18421931 bool queue_invalidate = false;
18431932 bool tried_invalidate = false;
18441933
1845
- /* if we are unmounting, flush any unused caps immediately. */
1846
- if (mdsc->stopping)
1847
- no_delay = true;
1848
-
18491934 spin_lock(&ci->i_ceph_lock);
1850
-
18511935 if (ci->i_ceph_flags & CEPH_I_FLUSH)
18521936 flags |= CHECK_CAPS_FLUSH;
1853
-
1854
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
1855
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
1856
- __cap_delay_cancel(mdsc, ci);
18571937
18581938 goto retry_locked;
18591939 retry:
18601940 spin_lock(&ci->i_ceph_lock);
18611941 retry_locked:
1942
+ /* Caps wanted by virtue of active open files. */
18621943 file_wanted = __ceph_caps_file_wanted(ci);
1944
+
1945
+ /* Caps which have active references against them */
18631946 used = __ceph_caps_used(ci);
1947
+
1948
+ /*
1949
+ * "issued" represents the current caps that the MDS wants us to have.
1950
+ * "implemented" is the set that we have been granted, and includes the
1951
+ * ones that have not yet been returned to the MDS (the "revoking" set,
1952
+ * usually because they have outstanding references).
1953
+ */
18641954 issued = __ceph_caps_issued(ci, &implemented);
18651955 revoking = implemented & ~issued;
18661956
18671957 want = file_wanted;
1958
+
1959
+ /* The ones we currently want to retain (may be adjusted below) */
18681960 retain = file_wanted | used | CEPH_CAP_PIN;
18691961 if (!mdsc->stopping && inode->i_nlink > 0) {
18701962 if (file_wanted) {
18711963 retain |= CEPH_CAP_ANY; /* be greedy */
18721964 } else if (S_ISDIR(inode->i_mode) &&
18731965 (issued & CEPH_CAP_FILE_SHARED) &&
1874
- __ceph_dir_is_complete(ci)) {
1966
+ __ceph_dir_is_complete(ci)) {
18751967 /*
18761968 * If a directory is complete, we want to keep
18771969 * the exclusive cap. So that MDS does not end up
18781970 * revoking the shared cap on every create/unlink
18791971 * operation.
18801972 */
1881
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1973
+ if (IS_RDONLY(inode)) {
1974
+ want = CEPH_CAP_ANY_SHARED;
1975
+ } else {
1976
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1977
+ }
18821978 retain |= want;
18831979 } else {
18841980
....@@ -1894,14 +1990,13 @@
18941990 }
18951991
18961992 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1897
- " issued %s revoking %s retain %s %s%s%s\n", inode,
1993
+ " issued %s revoking %s retain %s %s%s\n", inode,
18981994 ceph_cap_string(file_wanted),
18991995 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
19001996 ceph_cap_string(ci->i_flushing_caps),
19011997 ceph_cap_string(issued), ceph_cap_string(revoking),
19021998 ceph_cap_string(retain),
19031999 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1904
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
19052000 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
19062001
19072002 /*
....@@ -1909,8 +2004,8 @@
19092004 * have cached pages, but don't want them, then try to invalidate.
19102005 * If we fail, it's because pages are locked.... try again later.
19112006 */
1912
- if ((!no_delay || mdsc->stopping) &&
1913
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
2007
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
2008
+ S_ISREG(inode->i_mode) &&
19142009 !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
19152010 inode->i_data.nrpages && /* have cached pages */
19162011 (revoking & (CEPH_CAP_FILE_CACHE|
....@@ -1927,6 +2022,9 @@
19272022 }
19282023
19292024 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2025
+ int mflags = 0;
2026
+ struct cap_msg_args arg;
2027
+
19302028 cap = rb_entry(p, struct ceph_cap, ci_node);
19312029
19322030 /* avoid looping forever */
....@@ -1936,6 +2034,10 @@
19362034
19372035 /* NOTE: no side-effects allowed, until we take s_mutex */
19382036
2037
+ /*
2038
+ * If we have an auth cap, we don't need to consider any
2039
+ * overlapping caps as used.
2040
+ */
19392041 cap_used = used;
19402042 if (ci->i_auth_cap && cap != ci->i_auth_cap)
19412043 cap_used &= ~ci->i_auth_cap->issued;
....@@ -1990,31 +2092,10 @@
19902092 }
19912093
19922094 /* things we might delay */
1993
- if ((cap->issued & ~retain) == 0 &&
1994
- cap->mds_wanted == want)
2095
+ if ((cap->issued & ~retain) == 0)
19952096 continue; /* nope, all good */
19962097
1997
- if (no_delay)
1998
- goto ack;
1999
-
2000
- /* delay? */
2001
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
2002
- time_before(jiffies, ci->i_hold_caps_max)) {
2003
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
2004
- ceph_cap_string(cap->issued),
2005
- ceph_cap_string(cap->issued & retain),
2006
- ceph_cap_string(cap->mds_wanted),
2007
- ceph_cap_string(want));
2008
- delayed++;
2009
- continue;
2010
- }
2011
-
20122098 ack:
2013
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2014
- dout(" skipping %p I_NOFLUSH set\n", inode);
2015
- continue;
2016
- }
2017
-
20182099 if (session && session != cap->session) {
20192100 dout("oops, wrong session %p mutex\n", session);
20202101 mutex_unlock(&session->s_mutex);
....@@ -2052,10 +2133,8 @@
20522133 if (cap == ci->i_auth_cap &&
20532134 (ci->i_ceph_flags &
20542135 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
2055
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2136
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
20562137 __kick_flushing_caps(mdsc, session, ci, 0);
2057
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2058
- }
20592138 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
20602139 __ceph_flush_snaps(ci, session);
20612140
....@@ -2076,9 +2155,12 @@
20762155 }
20772156
20782157 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2079
- flushing = __mark_caps_flushing(inode, session, false,
2080
- &flush_tid,
2081
- &oldest_flush_tid);
2158
+ flushing = ci->i_dirty_caps;
2159
+ flush_tid = __mark_caps_flushing(inode, session, false,
2160
+ &oldest_flush_tid);
2161
+ if (flags & CHECK_CAPS_FLUSH &&
2162
+ list_empty(&session->s_cap_dirty))
2163
+ mflags |= CEPH_CLIENT_CAPS_SYNC;
20822164 } else {
20832165 flushing = 0;
20842166 flush_tid = 0;
....@@ -2088,18 +2170,23 @@
20882170 }
20892171
20902172 mds = cap->mds; /* remember mds, so we don't repeat */
2091
- sent++;
20922173
2093
- /* __send_cap drops i_ceph_lock */
2094
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
2095
- cap_used, want, retain, flushing,
2096
- flush_tid, oldest_flush_tid);
2174
+ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
2175
+ want, retain, flushing, flush_tid, oldest_flush_tid);
2176
+ spin_unlock(&ci->i_ceph_lock);
2177
+
2178
+ __send_cap(&arg, ci);
2179
+
20972180 goto retry; /* retake i_ceph_lock and restart our cap scan. */
20982181 }
20992182
2100
- /* Reschedule delayed caps release if we delayed anything */
2101
- if (delayed)
2183
+ /* periodically re-calculate caps wanted by open files */
2184
+ if (__ceph_is_any_real_caps(ci) &&
2185
+ list_empty(&ci->i_cap_delay_list) &&
2186
+ (file_wanted & ~CEPH_CAP_PIN) &&
2187
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
21022188 __cap_delay_requeue(mdsc, ci);
2189
+ }
21032190
21042191 spin_unlock(&ci->i_ceph_lock);
21052192
....@@ -2125,18 +2212,12 @@
21252212
21262213 retry:
21272214 spin_lock(&ci->i_ceph_lock);
2128
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2129
- spin_unlock(&ci->i_ceph_lock);
2130
- dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
2131
- goto out;
2132
- }
2215
+retry_locked:
21332216 if (ci->i_dirty_caps && ci->i_auth_cap) {
21342217 struct ceph_cap *cap = ci->i_auth_cap;
2135
- int used = __ceph_caps_used(ci);
2136
- int want = __ceph_caps_wanted(ci);
2137
- int delayed;
2218
+ struct cap_msg_args arg;
21382219
2139
- if (!session || session != cap->session) {
2220
+ if (session != cap->session) {
21402221 spin_unlock(&ci->i_ceph_lock);
21412222 if (session)
21422223 mutex_unlock(&session->s_mutex);
....@@ -2149,19 +2230,26 @@
21492230 goto out;
21502231 }
21512232
2152
- flushing = __mark_caps_flushing(inode, session, true,
2153
- &flush_tid, &oldest_flush_tid);
2154
-
2155
- /* __send_cap drops i_ceph_lock */
2156
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
2157
- used, want, (cap->issued | cap->implemented),
2158
- flushing, flush_tid, oldest_flush_tid);
2159
-
2160
- if (delayed) {
2161
- spin_lock(&ci->i_ceph_lock);
2162
- __cap_delay_requeue(mdsc, ci);
2163
- spin_unlock(&ci->i_ceph_lock);
2233
+ if (ci->i_ceph_flags &
2234
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2235
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2236
+ __kick_flushing_caps(mdsc, session, ci, 0);
2237
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2238
+ __ceph_flush_snaps(ci, session);
2239
+ goto retry_locked;
21642240 }
2241
+
2242
+ flushing = ci->i_dirty_caps;
2243
+ flush_tid = __mark_caps_flushing(inode, session, true,
2244
+ &oldest_flush_tid);
2245
+
2246
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2247
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2248
+ (cap->issued | cap->implemented),
2249
+ flushing, flush_tid, oldest_flush_tid);
2250
+ spin_unlock(&ci->i_ceph_lock);
2251
+
2252
+ __send_cap(&arg, ci);
21652253 } else {
21662254 if (!list_empty(&ci->i_cap_flush_list)) {
21672255 struct ceph_cap_flush *cf =
....@@ -2206,6 +2294,7 @@
22062294 */
22072295 static int unsafe_request_wait(struct inode *inode)
22082296 {
2297
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
22092298 struct ceph_inode_info *ci = ceph_inode(inode);
22102299 struct ceph_mds_request *req1 = NULL, *req2 = NULL;
22112300 int ret, err = 0;
....@@ -2225,6 +2314,76 @@
22252314 }
22262315 spin_unlock(&ci->i_unsafe_lock);
22272316
2317
+ /*
2318
+ * Manually trigger a flush of the journal logs in all the relevant
2319
+ * MDSes; otherwise, in the worst case, we must wait up to 5 seconds
2320
+ * for the MDSes to flush their journal logs periodically.
2321
+ */
2322
+ if (req1 || req2) {
2323
+ struct ceph_mds_request *req;
2324
+ struct ceph_mds_session **sessions;
2325
+ struct ceph_mds_session *s;
2326
+ unsigned int max_sessions;
2327
+ int i;
2328
+
2329
+ mutex_lock(&mdsc->mutex);
2330
+ max_sessions = mdsc->max_sessions;
2331
+
2332
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
2333
+ if (!sessions) {
2334
+ mutex_unlock(&mdsc->mutex);
2335
+ err = -ENOMEM;
2336
+ goto out;
2337
+ }
2338
+
2339
+ spin_lock(&ci->i_unsafe_lock);
2340
+ if (req1) {
2341
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
2342
+ r_unsafe_dir_item) {
2343
+ s = req->r_session;
2344
+ if (!s)
2345
+ continue;
2346
+ if (!sessions[s->s_mds]) {
2347
+ s = ceph_get_mds_session(s);
2348
+ sessions[s->s_mds] = s;
2349
+ }
2350
+ }
2351
+ }
2352
+ if (req2) {
2353
+ list_for_each_entry(req, &ci->i_unsafe_iops,
2354
+ r_unsafe_target_item) {
2355
+ s = req->r_session;
2356
+ if (!s)
2357
+ continue;
2358
+ if (!sessions[s->s_mds]) {
2359
+ s = ceph_get_mds_session(s);
2360
+ sessions[s->s_mds] = s;
2361
+ }
2362
+ }
2363
+ }
2364
+ spin_unlock(&ci->i_unsafe_lock);
2365
+
2366
+ /* the auth MDS */
2367
+ spin_lock(&ci->i_ceph_lock);
2368
+ if (ci->i_auth_cap) {
2369
+ s = ci->i_auth_cap->session;
2370
+ if (!sessions[s->s_mds])
2371
+ sessions[s->s_mds] = ceph_get_mds_session(s);
2372
+ }
2373
+ spin_unlock(&ci->i_ceph_lock);
2374
+ mutex_unlock(&mdsc->mutex);
2375
+
2376
+ /* send flush mdlog request to MDSes */
2377
+ for (i = 0; i < max_sessions; i++) {
2378
+ s = sessions[i];
2379
+ if (s) {
2380
+ send_flush_mdlog(s);
2381
+ ceph_put_mds_session(s);
2382
+ }
2383
+ }
2384
+ kfree(sessions);
2385
+ }
2386
+
22282387 dout("unsafe_request_wait %p wait on tid %llu %llu\n",
22292388 inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
22302389 if (req1) {
....@@ -2232,15 +2391,19 @@
22322391 ceph_timeout_jiffies(req1->r_timeout));
22332392 if (ret)
22342393 err = -EIO;
2235
- ceph_mdsc_put_request(req1);
22362394 }
22372395 if (req2) {
22382396 ret = !wait_for_completion_timeout(&req2->r_safe_completion,
22392397 ceph_timeout_jiffies(req2->r_timeout));
22402398 if (ret)
22412399 err = -EIO;
2242
- ceph_mdsc_put_request(req2);
22432400 }
2401
+
2402
+out:
2403
+ if (req1)
2404
+ ceph_mdsc_put_request(req1);
2405
+ if (req2)
2406
+ ceph_mdsc_put_request(req2);
22442407 return err;
22452408 }
22462409
....@@ -2249,35 +2412,40 @@
22492412 struct inode *inode = file->f_mapping->host;
22502413 struct ceph_inode_info *ci = ceph_inode(inode);
22512414 u64 flush_tid;
2252
- int ret;
2415
+ int ret, err;
22532416 int dirty;
22542417
22552418 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
22562419
22572420 ret = file_write_and_wait_range(file, start, end);
2258
- if (ret < 0)
2259
- goto out;
2260
-
22612421 if (datasync)
22622422 goto out;
22632423
2264
- inode_lock(inode);
2424
+ ret = ceph_wait_on_async_create(inode);
2425
+ if (ret)
2426
+ goto out;
22652427
22662428 dirty = try_flush_caps(inode, &flush_tid);
22672429 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
22682430
2269
- ret = unsafe_request_wait(inode);
2431
+ err = unsafe_request_wait(inode);
22702432
22712433 /*
22722434 * only wait on non-file metadata writeback (the mds
22732435 * can recover size and mtime, so we don't need to
22742436 * wait for that)
22752437 */
2276
- if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2277
- ret = wait_event_interruptible(ci->i_cap_wq,
2438
+ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2439
+ err = wait_event_interruptible(ci->i_cap_wq,
22782440 caps_are_flushed(inode, flush_tid));
22792441 }
2280
- inode_unlock(inode);
2442
+
2443
+ if (err < 0)
2444
+ ret = err;
2445
+
2446
+ err = file_check_and_advance_wb_err(file);
2447
+ if (err < 0)
2448
+ ret = err;
22812449 out:
22822450 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
22832451 return ret;
....@@ -2327,6 +2495,16 @@
23272495 struct ceph_cap_flush *cf;
23282496 int ret;
23292497 u64 first_tid = 0;
2498
+ u64 last_snap_flush = 0;
2499
+
2500
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2501
+
2502
+ list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2503
+ if (cf->is_capsnap) {
2504
+ last_snap_flush = cf->tid;
2505
+ break;
2506
+ }
2507
+ }
23302508
23312509 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
23322510 if (cf->tid < first_tid)
....@@ -2341,22 +2519,20 @@
23412519
23422520 first_tid = cf->tid + 1;
23432521
2344
- if (cf->caps) {
2522
+ if (!cf->is_capsnap) {
2523
+ struct cap_msg_args arg;
2524
+
23452525 dout("kick_flushing_caps %p cap %p tid %llu %s\n",
23462526 inode, cap, cf->tid, ceph_cap_string(cf->caps));
2347
- ci->i_ceph_flags |= CEPH_I_NODELAY;
2348
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
2349
- false, __ceph_caps_used(ci),
2527
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
2528
+ (cf->tid < last_snap_flush ?
2529
+ CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2530
+ __ceph_caps_used(ci),
23502531 __ceph_caps_wanted(ci),
2351
- cap->issued | cap->implemented,
2532
+ (cap->issued | cap->implemented),
23522533 cf->caps, cf->tid, oldest_flush_tid);
2353
- if (ret) {
2354
- pr_err("kick_flushing_caps: error sending "
2355
- "cap flush, ino (%llx.%llx) "
2356
- "tid %llu flushing %s\n",
2357
- ceph_vinop(inode), cf->tid,
2358
- ceph_cap_string(cf->caps));
2359
- }
2534
+ spin_unlock(&ci->i_ceph_lock);
2535
+ __send_cap(&arg, ci);
23602536 } else {
23612537 struct ceph_cap_snap *capsnap =
23622538 container_of(cf, struct ceph_cap_snap,
....@@ -2417,7 +2593,12 @@
24172593 */
24182594 if ((cap->issued & ci->i_flushing_caps) !=
24192595 ci->i_flushing_caps) {
2420
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2596
+ /* encode_caps_cb() will also reset these sequence
2597
+ * numbers. Make sure the sequence numbers in the cap flush
2598
+ * message match the later reconnect message */
2599
+ cap->seq = 0;
2600
+ cap->issue_seq = 0;
2601
+ cap->mseq = 0;
24212602 __kick_flushing_caps(mdsc, session, ci,
24222603 oldest_flush_tid);
24232604 } else {
....@@ -2435,6 +2616,8 @@
24352616 struct ceph_cap *cap;
24362617 u64 oldest_flush_tid;
24372618
2619
+ lockdep_assert_held(&session->s_mutex);
2620
+
24382621 dout("kick_flushing_caps mds%d\n", session->s_mds);
24392622
24402623 spin_lock(&mdsc->cap_dirty_lock);
....@@ -2451,7 +2634,6 @@
24512634 continue;
24522635 }
24532636 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2454
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
24552637 __kick_flushing_caps(mdsc, session, ci,
24562638 oldest_flush_tid);
24572639 }
....@@ -2459,16 +2641,15 @@
24592641 }
24602642 }
24612643
2462
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2463
- struct ceph_mds_session *session,
2464
- struct inode *inode)
2465
- __releases(ci->i_ceph_lock)
2644
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2645
+ struct ceph_inode_info *ci)
24662646 {
2467
- struct ceph_inode_info *ci = ceph_inode(inode);
2468
- struct ceph_cap *cap;
2647
+ struct ceph_mds_client *mdsc = session->s_mdsc;
2648
+ struct ceph_cap *cap = ci->i_auth_cap;
24692649
2470
- cap = ci->i_auth_cap;
2471
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
2650
+ lockdep_assert_held(&ci->i_ceph_lock);
2651
+
2652
+ dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
24722653 ceph_cap_string(ci->i_flushing_caps));
24732654
24742655 if (!list_empty(&ci->i_cap_flush_list)) {
....@@ -2479,11 +2660,7 @@
24792660 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
24802661 spin_unlock(&mdsc->cap_dirty_lock);
24812662
2482
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
24832663 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2484
- spin_unlock(&ci->i_ceph_lock);
2485
- } else {
2486
- spin_unlock(&ci->i_ceph_lock);
24872664 }
24882665 }
24892666
....@@ -2491,18 +2668,20 @@
24912668 /*
24922669 * Take references to capabilities we hold, so that we don't release
24932670 * them to the MDS prematurely.
2494
- *
2495
- * Protected by i_ceph_lock.
24962671 */
2497
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
2672
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
24982673 bool snap_rwsem_locked)
24992674 {
2675
+ lockdep_assert_held(&ci->i_ceph_lock);
2676
+
25002677 if (got & CEPH_CAP_PIN)
25012678 ci->i_pin_ref++;
25022679 if (got & CEPH_CAP_FILE_RD)
25032680 ci->i_rd_ref++;
25042681 if (got & CEPH_CAP_FILE_CACHE)
25052682 ci->i_rdcache_ref++;
2683
+ if (got & CEPH_CAP_FILE_EXCL)
2684
+ ci->i_fx_ref++;
25062685 if (got & CEPH_CAP_FILE_WR) {
25072686 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
25082687 BUG_ON(!snap_rwsem_locked);
....@@ -2515,7 +2694,7 @@
25152694 if (ci->i_wb_ref == 0)
25162695 ihold(&ci->vfs_inode);
25172696 ci->i_wb_ref++;
2518
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
2697
+ dout("%s %p wb %d -> %d (?)\n", __func__,
25192698 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
25202699 }
25212700 }
....@@ -2526,15 +2705,26 @@
25262705 * to (when applicable), and check against max_size here as well.
25272706 * Note that caller is responsible for ensuring max_size increases are
25282707 * requested from the MDS.
2708
+ *
2709
+ * Returns 0 if the caps could not be acquired (yet), 1 on success,
2710
+ * or a negative error code. There are 3 special error codes:
2711
+ * -EAGAIN: need to sleep but non-blocking is specified
2712
+ * -EFBIG: ask caller to call check_max_size() and try again.
2713
+ * -ESTALE: ask caller to call ceph_renew_caps() and try again.
25292714 */
2530
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2531
- loff_t endoff, bool nonblock, int *got, int *err)
2715
+enum {
2716
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
2717
+ NON_BLOCKING = (1 << 8),
2718
+ CHECK_FILELOCK = (1 << 9),
2719
+};
2720
+
2721
+static int try_get_cap_refs(struct inode *inode, int need, int want,
2722
+ loff_t endoff, int flags, int *got)
25322723 {
2533
- struct inode *inode = &ci->vfs_inode;
2724
+ struct ceph_inode_info *ci = ceph_inode(inode);
25342725 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
25352726 int ret = 0;
25362727 int have, implemented;
2537
- int file_wanted;
25382728 bool snap_rwsem_locked = false;
25392729
25402730 dout("get_cap_refs %p need %s want %s\n", inode,
....@@ -2543,13 +2733,10 @@
25432733 again:
25442734 spin_lock(&ci->i_ceph_lock);
25452735
2546
- /* make sure file is actually open */
2547
- file_wanted = __ceph_caps_file_wanted(ci);
2548
- if ((file_wanted & need) != need) {
2549
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
2550
- ceph_cap_string(need), ceph_cap_string(file_wanted));
2551
- *err = -EBADF;
2552
- ret = 1;
2736
+ if ((flags & CHECK_FILELOCK) &&
2737
+ (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2738
+ dout("try_get_cap_refs %p error filelock\n", inode);
2739
+ ret = -EIO;
25532740 goto out_unlock;
25542741 }
25552742
....@@ -2570,10 +2757,8 @@
25702757 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
25712758 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
25722759 inode, endoff, ci->i_max_size);
2573
- if (endoff > ci->i_requested_max_size) {
2574
- *err = -EAGAIN;
2575
- ret = 1;
2576
- }
2760
+ if (endoff > ci->i_requested_max_size)
2761
+ ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
25772762 goto out_unlock;
25782763 }
25792764 /*
....@@ -2607,9 +2792,8 @@
26072792 * we can not call down_read() when
26082793 * task isn't in TASK_RUNNING state
26092794 */
2610
- if (nonblock) {
2611
- *err = -EAGAIN;
2612
- ret = 1;
2795
+ if (flags & NON_BLOCKING) {
2796
+ ret = -EAGAIN;
26132797 goto out_unlock;
26142798 }
26152799
....@@ -2620,57 +2804,63 @@
26202804 }
26212805 snap_rwsem_locked = true;
26222806 }
2623
- *got = need | (have & want);
2624
- if ((need & CEPH_CAP_FILE_RD) &&
2807
+ if ((have & want) == want)
2808
+ *got = need | want;
2809
+ else
2810
+ *got = need;
2811
+ if (S_ISREG(inode->i_mode) &&
2812
+ (need & CEPH_CAP_FILE_RD) &&
26252813 !(*got & CEPH_CAP_FILE_CACHE))
26262814 ceph_disable_fscache_readpage(ci);
2627
- __take_cap_refs(ci, *got, true);
2815
+ ceph_take_cap_refs(ci, *got, true);
26282816 ret = 1;
26292817 }
26302818 } else {
26312819 int session_readonly = false;
2632
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
2820
+ int mds_wanted;
2821
+ if (ci->i_auth_cap &&
2822
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
26332823 struct ceph_mds_session *s = ci->i_auth_cap->session;
26342824 spin_lock(&s->s_cap_lock);
26352825 session_readonly = s->s_readonly;
26362826 spin_unlock(&s->s_cap_lock);
26372827 }
26382828 if (session_readonly) {
2639
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
2829
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
26402830 inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2641
- *err = -EROFS;
2642
- ret = 1;
2831
+ ret = -EROFS;
26432832 goto out_unlock;
26442833 }
26452834
2646
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
2647
- int mds_wanted;
2648
- if (READ_ONCE(mdsc->fsc->mount_state) ==
2649
- CEPH_MOUNT_SHUTDOWN) {
2650
- dout("get_cap_refs %p forced umount\n", inode);
2651
- *err = -EIO;
2652
- ret = 1;
2653
- goto out_unlock;
2654
- }
2655
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
2656
- if (need & ~(mds_wanted & need)) {
2657
- dout("get_cap_refs %p caps were dropped"
2658
- " (session killed?)\n", inode);
2659
- *err = -ESTALE;
2660
- ret = 1;
2661
- goto out_unlock;
2662
- }
2663
- if (!(file_wanted & ~mds_wanted))
2664
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
2835
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2836
+ dout("get_cap_refs %p forced umount\n", inode);
2837
+ ret = -EIO;
2838
+ goto out_unlock;
2839
+ }
2840
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
2841
+ if (need & ~mds_wanted) {
2842
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
2843
+ inode, ceph_cap_string(need),
2844
+ ceph_cap_string(mds_wanted));
2845
+ ret = -ESTALE;
2846
+ goto out_unlock;
26652847 }
26662848
2667
- dout("get_cap_refs %p have %s needed %s\n", inode,
2849
+ dout("get_cap_refs %p have %s need %s\n", inode,
26682850 ceph_cap_string(have), ceph_cap_string(need));
26692851 }
26702852 out_unlock:
2853
+
2854
+ __ceph_touch_fmode(ci, mdsc, flags);
2855
+
26712856 spin_unlock(&ci->i_ceph_lock);
26722857 if (snap_rwsem_locked)
26732858 up_read(&mdsc->snap_rwsem);
2859
+
2860
+ if (!ret)
2861
+ ceph_update_cap_mis(&mdsc->metric);
2862
+ else if (ret == 1)
2863
+ ceph_update_cap_hit(&mdsc->metric);
26742864
26752865 dout("get_cap_refs %p ret %d got %s\n", inode,
26762866 ret, ceph_cap_string(*got));
....@@ -2705,24 +2895,39 @@
27052895 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
27062896 }
27072897
2708
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
2898
+static inline int get_used_fmode(int caps)
27092899 {
2710
- int ret, err = 0;
2900
+ int fmode = 0;
2901
+ if (caps & CEPH_CAP_FILE_RD)
2902
+ fmode |= CEPH_FILE_MODE_RD;
2903
+ if (caps & CEPH_CAP_FILE_WR)
2904
+ fmode |= CEPH_FILE_MODE_WR;
2905
+ return fmode;
2906
+}
2907
+
2908
+int ceph_try_get_caps(struct inode *inode, int need, int want,
2909
+ bool nonblock, int *got)
2910
+{
2911
+ int ret, flags;
27112912
27122913 BUG_ON(need & ~CEPH_CAP_FILE_RD);
2713
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
2714
- ret = ceph_pool_perm_check(ci, need);
2715
- if (ret < 0)
2716
- return ret;
2717
-
2718
- ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
2719
- if (ret) {
2720
- if (err == -EAGAIN) {
2721
- ret = 0;
2722
- } else if (err < 0) {
2723
- ret = err;
2724
- }
2914
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
2915
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2916
+ CEPH_CAP_ANY_DIR_OPS));
2917
+ if (need) {
2918
+ ret = ceph_pool_perm_check(inode, need);
2919
+ if (ret < 0)
2920
+ return ret;
27252921 }
2922
+
2923
+ flags = get_used_fmode(need | want);
2924
+ if (nonblock)
2925
+ flags |= NON_BLOCKING;
2926
+
2927
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
2928
+ /* three special error codes */
2929
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
2930
+ ret = 0;
27262931 return ret;
27272932 }
27282933
....@@ -2731,34 +2936,54 @@
27312936 * due to a small max_size, make sure we check_max_size (and possibly
27322937 * ask the mds) so we don't get hung up indefinitely.
27332938 */
2734
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2939
+int ceph_get_caps(struct file *filp, int need, int want,
27352940 loff_t endoff, int *got, struct page **pinned_page)
27362941 {
2737
- int _got, ret, err = 0;
2942
+ struct ceph_file_info *fi = filp->private_data;
2943
+ struct inode *inode = file_inode(filp);
2944
+ struct ceph_inode_info *ci = ceph_inode(inode);
2945
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2946
+ int ret, _got, flags;
27382947
2739
- ret = ceph_pool_perm_check(ci, need);
2948
+ ret = ceph_pool_perm_check(inode, need);
27402949 if (ret < 0)
27412950 return ret;
27422951
2743
- while (true) {
2744
- if (endoff > 0)
2745
- check_max_size(&ci->vfs_inode, endoff);
2952
+ if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2953
+ fi->filp_gen != READ_ONCE(fsc->filp_gen))
2954
+ return -EBADF;
27462955
2747
- err = 0;
2956
+ flags = get_used_fmode(need | want);
2957
+
2958
+ while (true) {
2959
+ flags &= CEPH_FILE_MODE_MASK;
2960
+ if (atomic_read(&fi->num_locks))
2961
+ flags |= CHECK_FILELOCK;
27482962 _got = 0;
2749
- ret = try_get_cap_refs(ci, need, want, endoff,
2750
- false, &_got, &err);
2751
- if (ret) {
2752
- if (err == -EAGAIN)
2753
- continue;
2754
- if (err < 0)
2755
- ret = err;
2756
- } else {
2963
+ ret = try_get_cap_refs(inode, need, want, endoff,
2964
+ flags, &_got);
2965
+ WARN_ON_ONCE(ret == -EAGAIN);
2966
+ if (!ret) {
2967
+ struct ceph_mds_client *mdsc = fsc->mdsc;
2968
+ struct cap_wait cw;
27572969 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2970
+
2971
+ cw.ino = ceph_ino(inode);
2972
+ cw.tgid = current->tgid;
2973
+ cw.need = need;
2974
+ cw.want = want;
2975
+
2976
+ spin_lock(&mdsc->caps_list_lock);
2977
+ list_add(&cw.list, &mdsc->cap_wait_list);
2978
+ spin_unlock(&mdsc->caps_list_lock);
2979
+
2980
+ /* make sure the used fmode does not time out */
2981
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
27582982 add_wait_queue(&ci->i_cap_wq, &wait);
27592983
2760
- while (!try_get_cap_refs(ci, need, want, endoff,
2761
- true, &_got, &err)) {
2984
+ flags |= NON_BLOCKING;
2985
+ while (!(ret = try_get_cap_refs(inode, need, want,
2986
+ endoff, flags, &_got))) {
27622987 if (signal_pending(current)) {
27632988 ret = -ERESTARTSYS;
27642989 break;
....@@ -2767,27 +2992,48 @@
27672992 }
27682993
27692994 remove_wait_queue(&ci->i_cap_wq, &wait);
2995
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
27702996
2771
- if (err == -EAGAIN)
2997
+ spin_lock(&mdsc->caps_list_lock);
2998
+ list_del(&cw.list);
2999
+ spin_unlock(&mdsc->caps_list_lock);
3000
+
3001
+ if (ret == -EAGAIN)
27723002 continue;
2773
- if (err < 0)
2774
- ret = err;
27753003 }
3004
+
3005
+ if ((fi->fmode & CEPH_FILE_MODE_WR) &&
3006
+ fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
3007
+ if (ret >= 0 && _got)
3008
+ ceph_put_cap_refs(ci, _got);
3009
+ return -EBADF;
3010
+ }
3011
+
27763012 if (ret < 0) {
2777
- if (err == -ESTALE) {
3013
+ if (ret == -EFBIG || ret == -ESTALE) {
3014
+ int ret2 = ceph_wait_on_async_create(inode);
3015
+ if (ret2 < 0)
3016
+ return ret2;
3017
+ }
3018
+ if (ret == -EFBIG) {
3019
+ check_max_size(inode, endoff);
3020
+ continue;
3021
+ }
3022
+ if (ret == -ESTALE) {
27783023 /* session was killed, try renew caps */
2779
- ret = ceph_renew_caps(&ci->vfs_inode);
3024
+ ret = ceph_renew_caps(inode, flags);
27803025 if (ret == 0)
27813026 continue;
27823027 }
27833028 return ret;
27843029 }
27853030
2786
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
3031
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
3032
+ ci->i_inline_version != CEPH_INLINE_NONE &&
27873033 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2788
- i_size_read(&ci->vfs_inode) > 0) {
3034
+ i_size_read(inode) > 0) {
27893035 struct page *page =
2790
- find_get_page(ci->vfs_inode.i_mapping, 0);
3036
+ find_get_page(inode->i_mapping, 0);
27913037 if (page) {
27923038 if (PageUptodate(page)) {
27933039 *pinned_page = page;
....@@ -2806,7 +3052,7 @@
28063052 * getattr request will bring inline data into
28073053 * page cache
28083054 */
2809
- ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
3055
+ ret = __ceph_do_getattr(inode, NULL,
28103056 CEPH_STAT_CAP_INLINE_DATA,
28113057 true);
28123058 if (ret < 0)
....@@ -2816,7 +3062,8 @@
28163062 break;
28173063 }
28183064
2819
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
3065
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
3066
+ (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
28203067 ceph_fscache_revalidate_cookie(ci);
28213068
28223069 *got = _got;
....@@ -2830,7 +3077,7 @@
28303077 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
28313078 {
28323079 spin_lock(&ci->i_ceph_lock);
2833
- __take_cap_refs(ci, caps, false);
3080
+ ceph_take_cap_refs(ci, caps, false);
28343081 spin_unlock(&ci->i_ceph_lock);
28353082 }
28363083
....@@ -2867,7 +3114,8 @@
28673114 * If we are releasing a WR cap (from a sync write), finalize any affected
28683115 * cap_snap, and wake up any waiters.
28693116 */
2870
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3117
+static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
3118
+ bool skip_checking_caps)
28713119 {
28723120 struct inode *inode = &ci->vfs_inode;
28733121 int last = 0, put = 0, flushsnaps = 0, wake = 0;
....@@ -2880,6 +3128,9 @@
28803128 last++;
28813129 if (had & CEPH_CAP_FILE_CACHE)
28823130 if (--ci->i_rdcache_ref == 0)
3131
+ last++;
3132
+ if (had & CEPH_CAP_FILE_EXCL)
3133
+ if (--ci->i_fx_ref == 0)
28833134 last++;
28843135 if (had & CEPH_CAP_FILE_BUFFER) {
28853136 if (--ci->i_wb_ref == 0) {
....@@ -2912,7 +3163,7 @@
29123163 ci->i_head_snapc = NULL;
29133164 }
29143165 /* see comment in __ceph_remove_cap() */
2915
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
3166
+ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
29163167 drop_inode_snap_realm(ci);
29173168 }
29183169 spin_unlock(&ci->i_ceph_lock);
....@@ -2920,14 +3171,26 @@
29203171 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
29213172 last ? " last" : "", put ? " put" : "");
29223173
2923
- if (last && !flushsnaps)
2924
- ceph_check_caps(ci, 0, NULL);
2925
- else if (flushsnaps)
2926
- ceph_flush_snaps(ci, NULL);
3174
+ if (!skip_checking_caps) {
3175
+ if (last)
3176
+ ceph_check_caps(ci, 0, NULL);
3177
+ else if (flushsnaps)
3178
+ ceph_flush_snaps(ci, NULL);
3179
+ }
29273180 if (wake)
29283181 wake_up_all(&ci->i_cap_wq);
29293182 while (put-- > 0)
29303183 iput(inode);
3184
+}
3185
+
3186
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
3187
+{
3188
+ __ceph_put_cap_refs(ci, had, false);
3189
+}
3190
+
3191
+void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
3192
+{
3193
+ __ceph_put_cap_refs(ci, had, true);
29313194 }
29323195
29333196 /*
....@@ -2977,7 +3240,16 @@
29773240 break;
29783241 }
29793242 }
2980
- BUG_ON(!found);
3243
+
3244
+ if (!found) {
3245
+ /*
3246
+ * The capsnap should already be removed when removing
3247
+ * auth cap in the case of a forced unmount.
3248
+ */
3249
+ WARN_ON_ONCE(ci->i_auth_cap);
3250
+ goto unlock;
3251
+ }
3252
+
29813253 capsnap->dirty_pages -= nr;
29823254 if (capsnap->dirty_pages == 0) {
29833255 complete_capsnap = true;
....@@ -2999,17 +3271,20 @@
29993271 complete_capsnap ? " (complete capsnap)" : "");
30003272 }
30013273
3274
+unlock:
30023275 spin_unlock(&ci->i_ceph_lock);
30033276
30043277 if (last) {
3005
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
3278
+ ceph_check_caps(ci, 0, NULL);
30063279 } else if (flush_snaps) {
30073280 ceph_flush_snaps(ci, NULL);
30083281 }
30093282 if (complete_capsnap)
30103283 wake_up_all(&ci->i_cap_wq);
3011
- while (put-- > 0)
3012
- iput(inode);
3284
+ while (put-- > 0) {
3285
+ /* avoid calling iput_final() in osd dispatch threads */
3286
+ ceph_async_iput(inode);
3287
+ }
30133288 }
30143289
30153290 /*
....@@ -3054,8 +3329,10 @@
30543329 bool dirstat_valid;
30553330 u64 nfiles;
30563331 u64 nsubdirs;
3332
+ u64 change_attr;
30573333 /* currently issued */
30583334 int issued;
3335
+ struct timespec64 btime;
30593336 };
30603337
30613338 /*
....@@ -3079,7 +3356,8 @@
30793356 int used, wanted, dirty;
30803357 u64 size = le64_to_cpu(grant->size);
30813358 u64 max_size = le64_to_cpu(grant->max_size);
3082
- int check_caps = 0;
3359
+ unsigned char check_caps = 0;
3360
+ bool was_stale = cap->cap_gen < session->s_cap_gen;
30833361 bool wake = false;
30843362 bool writeback = false;
30853363 bool queue_trunc = false;
....@@ -3092,6 +3370,28 @@
30923370 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
30933371 inode->i_size);
30943372
3373
+
3374
+ /*
3375
+ * If CACHE is being revoked, and we have no dirty buffers,
3376
+ * try to invalidate (once). (If there are dirty buffers, we
3377
+ * will invalidate _after_ writeback.)
3378
+ */
3379
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
3380
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3381
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3382
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3383
+ if (try_nonblocking_invalidate(inode)) {
3384
+ /* there were locked pages.. invalidate later
3385
+ in a separate thread. */
3386
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3387
+ queue_invalidate = true;
3388
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
3389
+ }
3390
+ }
3391
+ }
3392
+
3393
+ if (was_stale)
3394
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
30953395
30963396 /*
30973397 * auth mds of the inode changed. we received the cap export message,
....@@ -3108,36 +3408,20 @@
31083408 newcaps |= cap->issued;
31093409 }
31103410
3111
- /*
3112
- * If CACHE is being revoked, and we have no dirty buffers,
3113
- * try to invalidate (once). (If there are dirty buffers, we
3114
- * will invalidate _after_ writeback.)
3115
- */
3116
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
3117
- ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3118
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3119
- !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
3120
- if (try_nonblocking_invalidate(inode)) {
3121
- /* there were locked pages.. invalidate later
3122
- in a separate thread. */
3123
- if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3124
- queue_invalidate = true;
3125
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
3126
- }
3127
- }
3128
- }
3129
-
31303411 /* side effects now are allowed */
31313412 cap->cap_gen = session->s_cap_gen;
31323413 cap->seq = seq;
31333414
31343415 __check_cap_issue(ci, cap, newcaps);
31353416
3417
+ inode_set_max_iversion_raw(inode, extra_info->change_attr);
3418
+
31363419 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
31373420 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
31383421 inode->i_mode = le32_to_cpu(grant->mode);
31393422 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
31403423 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
3424
+ ci->i_btime = extra_info->btime;
31413425 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
31423426 from_kuid(&init_user_ns, inode->i_uid),
31433427 from_kgid(&init_user_ns, inode->i_gid));
....@@ -3164,6 +3448,7 @@
31643448 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
31653449 ci->i_xattrs.version = version;
31663450 ceph_forget_all_cached_acls(inode);
3451
+ ceph_security_invalidate_secctx(inode);
31673452 }
31683453 }
31693454
....@@ -3216,10 +3501,6 @@
32163501 ci->i_requested_max_size = 0;
32173502 }
32183503 wake = true;
3219
- } else if (ci->i_wanted_max_size > ci->i_max_size &&
3220
- ci->i_wanted_max_size > ci->i_requested_max_size) {
3221
- /* CEPH_CAP_OP_IMPORT */
3222
- wake = true;
32233504 }
32243505 }
32253506
....@@ -3231,13 +3512,20 @@
32313512 ceph_cap_string(wanted),
32323513 ceph_cap_string(used),
32333514 ceph_cap_string(dirty));
3234
- if (wanted != le32_to_cpu(grant->wanted)) {
3235
- dout("mds wanted %s -> %s\n",
3236
- ceph_cap_string(le32_to_cpu(grant->wanted)),
3237
- ceph_cap_string(wanted));
3238
- /* imported cap may not have correct mds_wanted */
3239
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
3240
- check_caps = 1;
3515
+
3516
+ if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3517
+ (wanted & ~(cap->mds_wanted | newcaps))) {
3518
+ /*
3519
+ * If mds is importing cap, prior cap messages that update
3520
+ * 'wanted' may get dropped by mds (migrate seq mismatch).
3521
+ *
3522
+ * We don't send cap message to update 'wanted' if what we
3523
+ * want is already issued. If mds revokes caps, the cap message
3524
+ * that releases caps also tells mds what we want. But if
3525
+ * caps were forcibly revoked by mds (session stale), we may
3526
+ * not have told mds what we want.
3527
+ */
3528
+ check_caps = 1;
32413529 }
32423530
32433531 /* revocation, grant, or no-op? */
....@@ -3248,11 +3536,12 @@
32483536 ceph_cap_string(cap->issued),
32493537 ceph_cap_string(newcaps),
32503538 ceph_cap_string(revoking));
3251
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
3539
+ if (S_ISREG(inode->i_mode) &&
3540
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
32523541 writeback = true; /* initiate writeback; will delay ack */
3253
- else if (revoking == CEPH_CAP_FILE_CACHE &&
3254
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
3255
- queue_invalidate)
3542
+ else if (queue_invalidate &&
3543
+ revoking == CEPH_CAP_FILE_CACHE &&
3544
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
32563545 ; /* do nothing yet, invalidation will be queued */
32573546 else if (cap == ci->i_auth_cap)
32583547 check_caps = 1; /* check auth cap only */
....@@ -3288,13 +3577,22 @@
32883577 }
32893578
32903579 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3291
- if (newcaps & ~extra_info->issued)
3292
- wake = true;
3293
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
3580
+ if (ci->i_auth_cap == cap) {
3581
+ if (newcaps & ~extra_info->issued)
3582
+ wake = true;
3583
+
3584
+ if (ci->i_requested_max_size > max_size ||
3585
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3586
+ /* re-request max_size if necessary */
3587
+ ci->i_requested_max_size = 0;
3588
+ wake = true;
3589
+ }
3590
+
3591
+ ceph_kick_flushing_inode_caps(session, ci);
3592
+ }
32943593 up_read(&session->s_mdsc->snap_rwsem);
3295
- } else {
3296
- spin_unlock(&ci->i_ceph_lock);
32973594 }
3595
+ spin_unlock(&ci->i_ceph_lock);
32983596
32993597 if (fill_inline)
33003598 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
....@@ -3318,10 +3616,10 @@
33183616 wake_up_all(&ci->i_cap_wq);
33193617
33203618 if (check_caps == 1)
3321
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
3619
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
33223620 session);
33233621 else if (check_caps == 2)
3324
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
3622
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
33253623 else
33263624 mutex_unlock(&session->s_mutex);
33273625 }
....@@ -3348,15 +3646,26 @@
33483646 bool wake_mdsc = false;
33493647
33503648 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3649
+ /* Is this the one that was flushed? */
33513650 if (cf->tid == flush_tid)
33523651 cleaned = cf->caps;
3353
- if (cf->caps == 0) /* capsnap */
3652
+
3653
+ /* Is this a capsnap? */
3654
+ if (cf->is_capsnap)
33543655 continue;
3656
+
33553657 if (cf->tid <= flush_tid) {
3356
- if (__finish_cap_flush(NULL, ci, cf))
3357
- wake_ci = true;
3658
+ /*
3659
+ * An earlier or current tid. The FLUSH_ACK should
3660
+ * represent a superset of this flush's caps.
3661
+ */
3662
+ wake_ci |= __detach_cap_flush_from_ci(ci, cf);
33583663 list_add_tail(&cf->i_list, &to_remove);
33593664 } else {
3665
+ /*
3666
+ * This is a later one. Any caps in it are still dirty
3667
+ * so don't count them as cleaned.
3668
+ */
33603669 cleaned &= ~cf->caps;
33613670 if (!cleaned)
33623671 break;
....@@ -3376,10 +3685,8 @@
33763685
33773686 spin_lock(&mdsc->cap_dirty_lock);
33783687
3379
- list_for_each_entry(cf, &to_remove, i_list) {
3380
- if (__finish_cap_flush(mdsc, NULL, cf))
3381
- wake_mdsc = true;
3382
- }
3688
+ list_for_each_entry(cf, &to_remove, i_list)
3689
+ wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
33833690
33843691 if (ci->i_flushing_caps == 0) {
33853692 if (list_empty(&ci->i_cap_flush_list)) {
....@@ -3417,8 +3724,9 @@
34173724 while (!list_empty(&to_remove)) {
34183725 cf = list_first_entry(&to_remove,
34193726 struct ceph_cap_flush, i_list);
3420
- list_del(&cf->i_list);
3421
- ceph_free_cap_flush(cf);
3727
+ list_del_init(&cf->i_list);
3728
+ if (!cf->is_capsnap)
3729
+ ceph_free_cap_flush(cf);
34223730 }
34233731
34243732 if (wake_ci)
....@@ -3427,6 +3735,43 @@
34273735 wake_up_all(&mdsc->cap_flushing_wq);
34283736 if (drop)
34293737 iput(inode);
3738
+}
3739
+
3740
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3741
+ bool *wake_ci, bool *wake_mdsc)
3742
+{
3743
+ struct ceph_inode_info *ci = ceph_inode(inode);
3744
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3745
+ bool ret;
3746
+
3747
+ lockdep_assert_held(&ci->i_ceph_lock);
3748
+
3749
+ dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
3750
+
3751
+ list_del_init(&capsnap->ci_item);
3752
+ ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
3753
+ if (wake_ci)
3754
+ *wake_ci = ret;
3755
+
3756
+ spin_lock(&mdsc->cap_dirty_lock);
3757
+ if (list_empty(&ci->i_cap_flush_list))
3758
+ list_del_init(&ci->i_flushing_item);
3759
+
3760
+ ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
3761
+ if (wake_mdsc)
3762
+ *wake_mdsc = ret;
3763
+ spin_unlock(&mdsc->cap_dirty_lock);
3764
+}
3765
+
3766
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
3767
+ bool *wake_ci, bool *wake_mdsc)
3768
+{
3769
+ struct ceph_inode_info *ci = ceph_inode(inode);
3770
+
3771
+ lockdep_assert_held(&ci->i_ceph_lock);
3772
+
3773
+ WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
3774
+ __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
34303775 }
34313776
34323777 /*
....@@ -3466,25 +3811,10 @@
34663811 capsnap, capsnap->follows);
34673812 }
34683813 }
3469
- if (flushed) {
3470
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
3471
- dout(" removing %p cap_snap %p follows %lld\n",
3472
- inode, capsnap, follows);
3473
- list_del(&capsnap->ci_item);
3474
- if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
3475
- wake_ci = true;
3476
-
3477
- spin_lock(&mdsc->cap_dirty_lock);
3478
-
3479
- if (list_empty(&ci->i_cap_flush_list))
3480
- list_del_init(&ci->i_flushing_item);
3481
-
3482
- if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
3483
- wake_mdsc = true;
3484
-
3485
- spin_unlock(&mdsc->cap_dirty_lock);
3486
- }
3814
+ if (flushed)
3815
+ ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
34873816 spin_unlock(&ci->i_ceph_lock);
3817
+
34883818 if (flushed) {
34893819 ceph_put_snap_context(capsnap->context);
34903820 ceph_put_cap_snap(capsnap);
....@@ -3501,10 +3831,9 @@
35013831 *
35023832 * caller hold s_mutex.
35033833 */
3504
-static void handle_cap_trunc(struct inode *inode,
3834
+static bool handle_cap_trunc(struct inode *inode,
35053835 struct ceph_mds_caps *trunc,
35063836 struct ceph_mds_session *session)
3507
- __releases(ci->i_ceph_lock)
35083837 {
35093838 struct ceph_inode_info *ci = ceph_inode(inode);
35103839 int mds = session->s_mds;
....@@ -3515,7 +3844,9 @@
35153844 int implemented = 0;
35163845 int dirty = __ceph_caps_dirty(ci);
35173846 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
3518
- int queue_trunc = 0;
3847
+ bool queue_trunc = false;
3848
+
3849
+ lockdep_assert_held(&ci->i_ceph_lock);
35193850
35203851 issued |= implemented | dirty;
35213852
....@@ -3523,10 +3854,7 @@
35233854 inode, mds, seq, truncate_size, truncate_seq);
35243855 queue_trunc = ceph_fill_file_size(inode, issued,
35253856 truncate_seq, truncate_size, size);
3526
- spin_unlock(&ci->i_ceph_lock);
3527
-
3528
- if (queue_trunc)
3529
- ceph_queue_vmtruncate(inode);
3857
+ return queue_trunc;
35303858 }
35313859
35323860 /*
....@@ -3571,8 +3899,6 @@
35713899
35723900 if (target < 0) {
35733901 __ceph_remove_cap(cap, false);
3574
- if (!ci->i_auth_cap)
3575
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
35763902 goto out_unlock;
35773903 }
35783904
....@@ -3602,15 +3928,9 @@
36023928 tcap->issue_seq = t_seq - 1;
36033929 tcap->issued |= issued;
36043930 tcap->implemented |= issued;
3605
- if (cap == ci->i_auth_cap)
3931
+ if (cap == ci->i_auth_cap) {
36063932 ci->i_auth_cap = tcap;
3607
-
3608
- if (!list_empty(&ci->i_cap_flush_list) &&
3609
- ci->i_auth_cap == tcap) {
3610
- spin_lock(&mdsc->cap_dirty_lock);
3611
- list_move_tail(&ci->i_flushing_item,
3612
- &tcap->session->s_cap_flushing);
3613
- spin_unlock(&mdsc->cap_dirty_lock);
3933
+ change_auth_cap_ses(ci, tcap->session);
36143934 }
36153935 }
36163936 __ceph_remove_cap(cap, false);
....@@ -3619,7 +3939,7 @@
36193939 /* add placeholder for the export target */
36203940 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
36213941 tcap = new_cap;
3622
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
3942
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
36233943 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
36243944
36253945 if (!list_empty(&ci->i_cap_flush_list) &&
....@@ -3679,7 +3999,6 @@
36793999 struct ceph_mds_cap_peer *ph,
36804000 struct ceph_mds_session *session,
36814001 struct ceph_cap **target_cap, int *old_issued)
3682
- __acquires(ci->i_ceph_lock)
36834002 {
36844003 struct ceph_inode_info *ci = ceph_inode(inode);
36854004 struct ceph_cap *cap, *ocap, *new_cap = NULL;
....@@ -3704,14 +4023,13 @@
37044023
37054024 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
37064025 inode, ci, mds, mseq, peer);
3707
-
37084026 retry:
3709
- spin_lock(&ci->i_ceph_lock);
37104027 cap = __get_cap_for_mds(ci, mds);
37114028 if (!cap) {
37124029 if (!new_cap) {
37134030 spin_unlock(&ci->i_ceph_lock);
37144031 new_cap = ceph_get_cap(mdsc, NULL);
4032
+ spin_lock(&ci->i_ceph_lock);
37154033 goto retry;
37164034 }
37174035 cap = new_cap;
....@@ -3725,7 +4043,7 @@
37254043 __ceph_caps_issued(ci, &issued);
37264044 issued |= __ceph_caps_dirty(ci);
37274045
3728
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
4046
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
37294047 realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
37304048
37314049 ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
....@@ -3745,9 +4063,6 @@
37454063 }
37464064 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
37474065 }
3748
-
3749
- /* make sure we re-request max_size, if necessary */
3750
- ci->i_requested_max_size = 0;
37514066
37524067 *old_issued = issued;
37534068 *target_cap = cap;
....@@ -3777,6 +4092,7 @@
37774092 size_t snaptrace_len;
37784093 void *p, *end;
37794094 struct cap_extra_info extra_info = {};
4095
+ bool queue_trunc;
37804096
37814097 dout("handle_caps from mds%d\n", session->s_mds);
37824098
....@@ -3852,17 +4168,19 @@
38524168 }
38534169 }
38544170
3855
- if (msg_version >= 11) {
4171
+ if (msg_version >= 9) {
38564172 struct ceph_timespec *btime;
3857
- u64 change_attr;
3858
- u32 flags;
38594173
3860
- /* version >= 9 */
38614174 if (p + sizeof(*btime) > end)
38624175 goto bad;
38634176 btime = p;
4177
+ ceph_decode_timespec64(&extra_info.btime, btime);
38644178 p += sizeof(*btime);
3865
- ceph_decode_64_safe(&p, end, change_attr, bad);
4179
+ ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4180
+ }
4181
+
4182
+ if (msg_version >= 11) {
4183
+ u32 flags;
38664184 /* version >= 10 */
38674185 ceph_decode_32_safe(&p, end, flags, bad);
38684186 /* version >= 11 */
....@@ -3878,7 +4196,7 @@
38784196 vino.snap, inode);
38794197
38804198 mutex_lock(&session->s_mutex);
3881
- session->s_seq++;
4199
+ inc_session_sequence(session);
38824200 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
38834201 (unsigned)seq);
38844202
....@@ -3894,9 +4212,7 @@
38944212 cap->seq = seq;
38954213 cap->issue_seq = seq;
38964214 spin_lock(&session->s_cap_lock);
3897
- list_add_tail(&cap->session_caps,
3898
- &session->s_cap_releases);
3899
- session->s_num_cap_releases++;
4215
+ __ceph_queue_cap_release(session, cap);
39004216 spin_unlock(&session->s_cap_lock);
39014217 }
39024218 goto flush_cap_releases;
....@@ -3924,6 +4240,7 @@
39244240 } else {
39254241 down_read(&mdsc->snap_rwsem);
39264242 }
4243
+ spin_lock(&ci->i_ceph_lock);
39274244 handle_cap_import(mdsc, inode, h, peer, session,
39284245 &cap, &extra_info.issued);
39294246 handle_cap_grant(inode, session, cap,
....@@ -3960,7 +4277,10 @@
39604277 break;
39614278
39624279 case CEPH_CAP_OP_TRUNC:
3963
- handle_cap_trunc(inode, h, session);
4280
+ queue_trunc = handle_cap_trunc(inode, h, session);
4281
+ spin_unlock(&ci->i_ceph_lock);
4282
+ if (queue_trunc)
4283
+ ceph_queue_vmtruncate(inode);
39644284 break;
39654285
39664286 default:
....@@ -3969,7 +4289,13 @@
39694289 ceph_cap_op_name(op));
39704290 }
39714291
3972
- goto done;
4292
+done:
4293
+ mutex_unlock(&session->s_mutex);
4294
+done_unlocked:
4295
+ ceph_put_string(extra_info.pool_ns);
4296
+ /* avoid calling iput_final() in mds dispatch threads */
4297
+ ceph_async_iput(inode);
4298
+ return;
39734299
39744300 flush_cap_releases:
39754301 /*
....@@ -3977,14 +4303,8 @@
39774303 * along for the mds (who clearly thinks we still have this
39784304 * cap).
39794305 */
3980
- ceph_send_cap_releases(mdsc, session);
3981
-
3982
-done:
3983
- mutex_unlock(&session->s_mutex);
3984
-done_unlocked:
3985
- iput(inode);
3986
- ceph_put_string(extra_info.pool_ns);
3987
- return;
4306
+ ceph_flush_cap_releases(mdsc, session);
4307
+ goto done;
39884308
39894309 bad:
39904310 pr_err("ceph_handle_caps: corrupt message\n");
....@@ -3994,56 +4314,70 @@
39944314
39954315 /*
39964316 * Delayed work handler to process end of delayed cap release LRU list.
4317
+ *
4318
+ * If new caps are added to the list while processing it, these won't get
4319
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
4320
+ * returned so that the work can be scheduled accordingly.
39974321 */
3998
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4322
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
39994323 {
40004324 struct inode *inode;
40014325 struct ceph_inode_info *ci;
4002
- int flags = CHECK_CAPS_NODELAY;
4326
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4327
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4328
+ unsigned long loop_start = jiffies;
4329
+ unsigned long delay = 0;
40034330
40044331 dout("check_delayed_caps\n");
4005
- while (1) {
4006
- spin_lock(&mdsc->cap_delay_lock);
4007
- if (list_empty(&mdsc->cap_delay_list))
4008
- break;
4332
+ spin_lock(&mdsc->cap_delay_lock);
4333
+ while (!list_empty(&mdsc->cap_delay_list)) {
40094334 ci = list_first_entry(&mdsc->cap_delay_list,
40104335 struct ceph_inode_info,
40114336 i_cap_delay_list);
4337
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4338
+ dout("%s caps added recently. Exiting loop", __func__);
4339
+ delay = ci->i_hold_caps_max;
4340
+ break;
4341
+ }
40124342 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
40134343 time_before(jiffies, ci->i_hold_caps_max))
40144344 break;
40154345 list_del_init(&ci->i_cap_delay_list);
40164346
40174347 inode = igrab(&ci->vfs_inode);
4018
- spin_unlock(&mdsc->cap_delay_lock);
4019
-
40204348 if (inode) {
4349
+ spin_unlock(&mdsc->cap_delay_lock);
40214350 dout("check_delayed_caps on %p\n", inode);
4022
- ceph_check_caps(ci, flags, NULL);
4023
- iput(inode);
4351
+ ceph_check_caps(ci, 0, NULL);
4352
+ /* avoid calling iput_final() in tick thread */
4353
+ ceph_async_iput(inode);
4354
+ spin_lock(&mdsc->cap_delay_lock);
40244355 }
40254356 }
40264357 spin_unlock(&mdsc->cap_delay_lock);
4358
+
4359
+ return delay;
40274360 }
40284361
40294362 /*
40304363 * Flush all dirty caps to the mds
40314364 */
4032
-void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4365
+static void flush_dirty_session_caps(struct ceph_mds_session *s)
40334366 {
4367
+ struct ceph_mds_client *mdsc = s->s_mdsc;
40344368 struct ceph_inode_info *ci;
40354369 struct inode *inode;
40364370
40374371 dout("flush_dirty_caps\n");
40384372 spin_lock(&mdsc->cap_dirty_lock);
4039
- while (!list_empty(&mdsc->cap_dirty)) {
4040
- ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
4373
+ while (!list_empty(&s->s_cap_dirty)) {
4374
+ ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
40414375 i_dirty_item);
40424376 inode = &ci->vfs_inode;
40434377 ihold(inode);
40444378 dout("flush_dirty_caps %p\n", inode);
40454379 spin_unlock(&mdsc->cap_dirty_lock);
4046
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
4380
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
40474381 iput(inode);
40484382 spin_lock(&mdsc->cap_dirty_lock);
40494383 }
....@@ -4051,14 +4385,53 @@
40514385 dout("flush_dirty_caps done\n");
40524386 }
40534387
4054
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
4388
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
40554389 {
4056
- int i;
4390
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
4391
+}
4392
+
4393
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
4394
+ struct ceph_mds_client *mdsc, int fmode)
4395
+{
4396
+ unsigned long now = jiffies;
4397
+ if (fmode & CEPH_FILE_MODE_RD)
4398
+ ci->i_last_rd = now;
4399
+ if (fmode & CEPH_FILE_MODE_WR)
4400
+ ci->i_last_wr = now;
4401
+ /* queue periodic check */
4402
+ if (fmode &&
4403
+ __ceph_is_any_real_caps(ci) &&
4404
+ list_empty(&ci->i_cap_delay_list))
4405
+ __cap_delay_requeue(mdsc, ci);
4406
+}
4407
+
4408
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4409
+{
4410
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
40574411 int bits = (fmode << 1) | 1;
4412
+ bool already_opened = false;
4413
+ int i;
4414
+
4415
+ if (count == 1)
4416
+ atomic64_inc(&mdsc->metric.opened_files);
4417
+
4418
+ spin_lock(&ci->i_ceph_lock);
40584419 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4420
+ /*
4421
+ * If any of the mode ref is larger than 0,
4422
+ * that means it has been already opened by
4423
+ * others. Just skip checking the PIN ref.
4424
+ */
4425
+ if (i && ci->i_nr_by_mode[i])
4426
+ already_opened = true;
4427
+
40594428 if (bits & (1 << i))
4060
- ci->i_nr_by_mode[i]++;
4429
+ ci->i_nr_by_mode[i] += count;
40614430 }
4431
+
4432
+ if (!already_opened)
4433
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
4434
+ spin_unlock(&ci->i_ceph_lock);
40624435 }
40634436
40644437 /*
....@@ -4066,30 +4439,39 @@
40664439 * we may need to release capabilities to the MDS (or schedule
40674440 * their delayed release).
40684441 */
4069
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
4442
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
40704443 {
4071
- int i, last = 0;
4444
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
40724445 int bits = (fmode << 1) | 1;
4446
+ bool is_closed = true;
4447
+ int i;
4448
+
4449
+ if (count == 1)
4450
+ atomic64_dec(&mdsc->metric.opened_files);
4451
+
40734452 spin_lock(&ci->i_ceph_lock);
40744453 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
40754454 if (bits & (1 << i)) {
4076
- BUG_ON(ci->i_nr_by_mode[i] == 0);
4077
- if (--ci->i_nr_by_mode[i] == 0)
4078
- last++;
4455
+ BUG_ON(ci->i_nr_by_mode[i] < count);
4456
+ ci->i_nr_by_mode[i] -= count;
40794457 }
4080
- }
4081
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
4082
- &ci->vfs_inode, fmode,
4083
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
4084
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
4085
- spin_unlock(&ci->i_ceph_lock);
40864458
4087
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
4088
- ceph_check_caps(ci, 0, NULL);
4459
+ /*
4460
+ * If any of the mode ref is not 0 after
4461
+ * decreased, that means it is still opened
4462
+ * by others. Just skip checking the PIN ref.
4463
+ */
4464
+ if (i && ci->i_nr_by_mode[i])
4465
+ is_closed = false;
4466
+ }
4467
+
4468
+ if (is_closed)
4469
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
4470
+ spin_unlock(&ci->i_ceph_lock);
40894471 }
40904472
40914473 /*
4092
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
4474
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
40934475 * looks like the link count will hit 0, drop any other caps (other
40944476 * than PIN) we don't specifically want (due to the file still being
40954477 * open).
....@@ -4103,7 +4485,6 @@
41034485 if (inode->i_nlink == 1) {
41044486 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
41054487
4106
- ci->i_ceph_flags |= CEPH_I_NODELAY;
41074488 if (__ceph_caps_dirty(ci)) {
41084489 struct ceph_mds_client *mdsc =
41094490 ceph_inode_to_client(inode)->mdsc;
....@@ -4159,8 +4540,6 @@
41594540 if (force || (cap->issued & drop)) {
41604541 if (cap->issued & drop) {
41614542 int wanted = __ceph_caps_wanted(ci);
4162
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
4163
- wanted |= cap->mds_wanted;
41644543 dout("encode_inode_release %p cap %p "
41654544 "%s -> %s, wanted %s -> %s\n", inode, cap,
41664545 ceph_cap_string(cap->issued),
....@@ -4171,6 +4550,9 @@
41714550 cap->issued &= ~drop;
41724551 cap->implemented &= ~drop;
41734552 cap->mds_wanted = wanted;
4553
+ if (cap == ci->i_auth_cap &&
4554
+ !(wanted & CEPH_CAP_ANY_FILE_WR))
4555
+ ci->i_requested_max_size = 0;
41744556 } else {
41754557 dout("encode_inode_release %p cap %p %s"
41764558 " (force)\n", inode, cap,