forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/fs/ceph/snap.c
....@@ -3,11 +3,13 @@
33
44 #include <linux/sort.h>
55 #include <linux/slab.h>
6
-
6
+#include <linux/iversion.h>
77 #include "super.h"
88 #include "mds_client.h"
9
-
109 #include <linux/ceph/decode.h>
10
+
11
+/* unused map expires after 5 minutes */
12
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
1113
1214 /*
1315 * Snapshots in ceph are driven in large part by cooperation from the
....@@ -58,24 +60,26 @@
5860 /*
5961 * increase ref count for the realm
6062 *
61
- * caller must hold snap_rwsem for write.
63
+ * caller must hold snap_rwsem.
6264 */
6365 void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
6466 struct ceph_snap_realm *realm)
6567 {
66
- dout("get_realm %p %d -> %d\n", realm,
67
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
68
+ lockdep_assert_held(&mdsc->snap_rwsem);
69
+
6870 /*
69
- * since we _only_ increment realm refs or empty the empty
70
- * list with snap_rwsem held, adjusting the empty list here is
71
- * safe. we do need to protect against concurrent empty list
72
- * additions, however.
71
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
72
+ * atomically with the refcount change. Go ahead and bump the
73
+ * nref here, unless it's 0, in which case we take the spinlock
74
+ * and then do the increment and remove it from the list.
7375 */
74
- if (atomic_inc_return(&realm->nref) == 1) {
75
- spin_lock(&mdsc->snap_empty_lock);
76
+ if (atomic_inc_not_zero(&realm->nref))
77
+ return;
78
+
79
+ spin_lock(&mdsc->snap_empty_lock);
80
+ if (atomic_inc_return(&realm->nref) == 1)
7681 list_del_init(&realm->empty_item);
77
- spin_unlock(&mdsc->snap_empty_lock);
78
- }
82
+ spin_unlock(&mdsc->snap_empty_lock);
7983 }
8084
8185 static void __insert_snap_realm(struct rb_root *root,
....@@ -111,6 +115,8 @@
111115 {
112116 struct ceph_snap_realm *realm;
113117
118
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
119
+
114120 realm = kzalloc(sizeof(*realm), GFP_NOFS);
115121 if (!realm)
116122 return ERR_PTR(-ENOMEM);
....@@ -124,6 +130,8 @@
124130 INIT_LIST_HEAD(&realm->inodes_with_caps);
125131 spin_lock_init(&realm->inodes_with_caps_lock);
126132 __insert_snap_realm(&mdsc->snap_realms, realm);
133
+ mdsc->num_snap_realms++;
134
+
127135 dout("create_snap_realm %llx %p\n", realm->ino, realm);
128136 return realm;
129137 }
....@@ -131,13 +139,15 @@
131139 /*
132140 * lookup the realm rooted at @ino.
133141 *
134
- * caller must hold snap_rwsem for write.
142
+ * caller must hold snap_rwsem.
135143 */
136144 static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
137145 u64 ino)
138146 {
139147 struct rb_node *n = mdsc->snap_realms.rb_node;
140148 struct ceph_snap_realm *r;
149
+
150
+ lockdep_assert_held(&mdsc->snap_rwsem);
141151
142152 while (n) {
143153 r = rb_entry(n, struct ceph_snap_realm, node);
....@@ -172,9 +182,12 @@
172182 static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
173183 struct ceph_snap_realm *realm)
174184 {
185
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
186
+
175187 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
176188
177189 rb_erase(&realm->node, &mdsc->snap_realms);
190
+ mdsc->num_snap_realms--;
178191
179192 if (realm->parent) {
180193 list_del_init(&realm->child_item);
....@@ -193,28 +206,30 @@
193206 static void __put_snap_realm(struct ceph_mds_client *mdsc,
194207 struct ceph_snap_realm *realm)
195208 {
196
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
209
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
210
+
211
+ /*
212
+ * We do not require the snap_empty_lock here, as any caller that
213
+ * increments the value must hold the snap_rwsem.
214
+ */
198215 if (atomic_dec_and_test(&realm->nref))
199216 __destroy_snap_realm(mdsc, realm);
200217 }
201218
202219 /*
203
- * caller needn't hold any locks
220
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
204221 */
205222 void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
206223 struct ceph_snap_realm *realm)
207224 {
208
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
209
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
210
- if (!atomic_dec_and_test(&realm->nref))
225
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
211226 return;
212227
213228 if (down_write_trylock(&mdsc->snap_rwsem)) {
229
+ spin_unlock(&mdsc->snap_empty_lock);
214230 __destroy_snap_realm(mdsc, realm);
215231 up_write(&mdsc->snap_rwsem);
216232 } else {
217
- spin_lock(&mdsc->snap_empty_lock);
218233 list_add(&realm->empty_item, &mdsc->snap_empty);
219234 spin_unlock(&mdsc->snap_empty_lock);
220235 }
....@@ -230,6 +245,8 @@
230245 static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
231246 {
232247 struct ceph_snap_realm *realm;
248
+
249
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
233250
234251 spin_lock(&mdsc->snap_empty_lock);
235252 while (!list_empty(&mdsc->snap_empty)) {
....@@ -263,6 +280,8 @@
263280 u64 parentino)
264281 {
265282 struct ceph_snap_realm *parent;
283
+
284
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
266285
267286 if (realm->parent_ino == parentino)
268287 return 0;
....@@ -468,6 +487,9 @@
468487 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
469488 return;
470489 }
490
+ capsnap->cap_flush.is_capsnap = true;
491
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
492
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
471493
472494 spin_lock(&ci->i_ceph_lock);
473495 used = __ceph_caps_used(ci);
....@@ -597,13 +619,15 @@
597619 struct ceph_cap_snap *capsnap)
598620 {
599621 struct inode *inode = &ci->vfs_inode;
600
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
622
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
601623
602624 BUG_ON(capsnap->writing);
603625 capsnap->size = inode->i_size;
604626 capsnap->mtime = inode->i_mtime;
605627 capsnap->atime = inode->i_atime;
606628 capsnap->ctime = inode->i_ctime;
629
+ capsnap->btime = ci->i_btime;
630
+ capsnap->change_attr = inode_peek_iversion_raw(inode);
607631 capsnap->time_warp_seq = ci->i_time_warp_seq;
608632 capsnap->truncate_size = ci->i_truncate_size;
609633 capsnap->truncate_seq = ci->i_truncate_seq;
....@@ -646,13 +670,15 @@
646670 if (!inode)
647671 continue;
648672 spin_unlock(&realm->inodes_with_caps_lock);
649
- iput(lastinode);
673
+ /* avoid calling iput_final() while holding
674
+ * mdsc->snap_rwsem or in mds dispatch threads */
675
+ ceph_async_iput(lastinode);
650676 lastinode = inode;
651677 ceph_queue_cap_snap(ci);
652678 spin_lock(&realm->inodes_with_caps_lock);
653679 }
654680 spin_unlock(&realm->inodes_with_caps_lock);
655
- iput(lastinode);
681
+ ceph_async_iput(lastinode);
656682
657683 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
658684 }
....@@ -671,14 +697,19 @@
671697 struct ceph_mds_snap_realm *ri; /* encoded */
672698 __le64 *snaps; /* encoded */
673699 __le64 *prior_parent_snaps; /* encoded */
674
- struct ceph_snap_realm *realm = NULL;
700
+ struct ceph_snap_realm *realm;
675701 struct ceph_snap_realm *first_realm = NULL;
676
- int invalidate = 0;
702
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
703
+ int rebuild_snapcs;
677704 int err = -ENOMEM;
678705 LIST_HEAD(dirty_realms);
679706
707
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
708
+
680709 dout("update_snap_trace deletion=%d\n", deletion);
681710 more:
711
+ realm = NULL;
712
+ rebuild_snapcs = 0;
682713 ceph_decode_need(&p, e, sizeof(*ri), bad);
683714 ri = p;
684715 p += sizeof(*ri);
....@@ -702,7 +733,7 @@
702733 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
703734 if (err < 0)
704735 goto fail;
705
- invalidate += err;
736
+ rebuild_snapcs += err;
706737
707738 if (le64_to_cpu(ri->seq) > realm->seq) {
708739 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
....@@ -727,22 +758,30 @@
727758 if (realm->seq > mdsc->last_snap_seq)
728759 mdsc->last_snap_seq = realm->seq;
729760
730
- invalidate = 1;
761
+ rebuild_snapcs = 1;
731762 } else if (!realm->cached_context) {
732763 dout("update_snap_trace %llx %p seq %lld new\n",
733764 realm->ino, realm, realm->seq);
734
- invalidate = 1;
765
+ rebuild_snapcs = 1;
735766 } else {
736767 dout("update_snap_trace %llx %p seq %lld unchanged\n",
737768 realm->ino, realm, realm->seq);
738769 }
739770
740
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
741
- realm, invalidate, p, e);
771
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
772
+ realm, rebuild_snapcs, p, e);
742773
743
- /* invalidate when we reach the _end_ (root) of the trace */
744
- if (invalidate && p >= e)
745
- rebuild_snap_realms(realm, &dirty_realms);
774
+ /*
775
+ * this will always track the topmost parent realm from which
776
+ * we need to rebuild the snapshot contexts _downward_ in
777
+ * hierarchy.
778
+ */
779
+ if (rebuild_snapcs)
780
+ realm_to_rebuild = realm;
781
+
782
+ /* rebuild the snapshot contexts when we reach the _end_ (root) of the trace */
783
+ if (realm_to_rebuild && p >= e)
784
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
746785
747786 if (!first_realm)
748787 first_realm = realm;
....@@ -804,7 +843,9 @@
804843 ihold(inode);
805844 spin_unlock(&mdsc->snap_flush_lock);
806845 ceph_flush_snaps(ci, &session);
807
- iput(inode);
846
+ /* avoid calling iput_final() while holding
847
+ * session->s_mutex or in mds dispatch threads */
848
+ ceph_async_iput(inode);
808849 spin_lock(&mdsc->snap_flush_lock);
809850 }
810851 spin_unlock(&mdsc->snap_flush_lock);
....@@ -862,7 +903,7 @@
862903 ceph_snap_op_name(op), split, trace_len);
863904
864905 mutex_lock(&session->s_mutex);
865
- session->s_seq++;
906
+ inc_session_sequence(session);
866907 mutex_unlock(&session->s_mutex);
867908
868909 down_write(&mdsc->snap_rwsem);
....@@ -948,12 +989,14 @@
948989 ceph_get_snap_realm(mdsc, realm);
949990 ceph_put_snap_realm(mdsc, oldrealm);
950991
951
- iput(inode);
992
+ /* avoid calling iput_final() while holding
993
+ * mdsc->snap_rwsem or in mds dispatch threads */
994
+ ceph_async_iput(inode);
952995 continue;
953996
954997 skip_inode:
955998 spin_unlock(&ci->i_ceph_lock);
956
- iput(inode);
999
+ ceph_async_iput(inode);
9571000 }
9581001
9591002 /* we may have taken some of the old realm's children. */
....@@ -993,3 +1036,155 @@
9931036 up_write(&mdsc->snap_rwsem);
9941037 return;
9951038 }
1039
+
1040
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
1041
+ u64 snap)
1042
+{
1043
+ struct ceph_snapid_map *sm, *exist;
1044
+ struct rb_node **p, *parent;
1045
+ int ret;
1046
+
1047
+ exist = NULL;
1048
+ spin_lock(&mdsc->snapid_map_lock);
1049
+ p = &mdsc->snapid_map_tree.rb_node;
1050
+ while (*p) {
1051
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
1052
+ if (snap > exist->snap) {
1053
+ p = &(*p)->rb_left;
1054
+ } else if (snap < exist->snap) {
1055
+ p = &(*p)->rb_right;
1056
+ } else {
1057
+ if (atomic_inc_return(&exist->ref) == 1)
1058
+ list_del_init(&exist->lru);
1059
+ break;
1060
+ }
1061
+ exist = NULL;
1062
+ }
1063
+ spin_unlock(&mdsc->snapid_map_lock);
1064
+ if (exist) {
1065
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
1066
+ return exist;
1067
+ }
1068
+
1069
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
1070
+ if (!sm)
1071
+ return NULL;
1072
+
1073
+ ret = get_anon_bdev(&sm->dev);
1074
+ if (ret < 0) {
1075
+ kfree(sm);
1076
+ return NULL;
1077
+ }
1078
+
1079
+ INIT_LIST_HEAD(&sm->lru);
1080
+ atomic_set(&sm->ref, 1);
1081
+ sm->snap = snap;
1082
+
1083
+ exist = NULL;
1084
+ parent = NULL;
1085
+ p = &mdsc->snapid_map_tree.rb_node;
1086
+ spin_lock(&mdsc->snapid_map_lock);
1087
+ while (*p) {
1088
+ parent = *p;
1089
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
1090
+ if (snap > exist->snap)
1091
+ p = &(*p)->rb_left;
1092
+ else if (snap < exist->snap)
1093
+ p = &(*p)->rb_right;
1094
+ else
1095
+ break;
1096
+ exist = NULL;
1097
+ }
1098
+ if (exist) {
1099
+ if (atomic_inc_return(&exist->ref) == 1)
1100
+ list_del_init(&exist->lru);
1101
+ } else {
1102
+ rb_link_node(&sm->node, parent, p);
1103
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
1104
+ }
1105
+ spin_unlock(&mdsc->snapid_map_lock);
1106
+ if (exist) {
1107
+ free_anon_bdev(sm->dev);
1108
+ kfree(sm);
1109
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
1110
+ return exist;
1111
+ }
1112
+
1113
+ dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
1114
+ return sm;
1115
+}
1116
+
1117
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
1118
+ struct ceph_snapid_map *sm)
1119
+{
1120
+ if (!sm)
1121
+ return;
1122
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
1123
+ if (!RB_EMPTY_NODE(&sm->node)) {
1124
+ sm->last_used = jiffies;
1125
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
1126
+ spin_unlock(&mdsc->snapid_map_lock);
1127
+ } else {
1128
+ /* already cleaned up by
1129
+ * ceph_cleanup_snapid_map() */
1130
+ spin_unlock(&mdsc->snapid_map_lock);
1131
+ kfree(sm);
1132
+ }
1133
+ }
1134
+}
1135
+
1136
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
1137
+{
1138
+ struct ceph_snapid_map *sm;
1139
+ unsigned long now;
1140
+ LIST_HEAD(to_free);
1141
+
1142
+ spin_lock(&mdsc->snapid_map_lock);
1143
+ now = jiffies;
1144
+
1145
+ while (!list_empty(&mdsc->snapid_map_lru)) {
1146
+ sm = list_first_entry(&mdsc->snapid_map_lru,
1147
+ struct ceph_snapid_map, lru);
1148
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
1149
+ break;
1150
+
1151
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
1152
+ list_move(&sm->lru, &to_free);
1153
+ }
1154
+ spin_unlock(&mdsc->snapid_map_lock);
1155
+
1156
+ while (!list_empty(&to_free)) {
1157
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1158
+ list_del(&sm->lru);
1159
+ dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
1160
+ free_anon_bdev(sm->dev);
1161
+ kfree(sm);
1162
+ }
1163
+}
1164
+
1165
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
1166
+{
1167
+ struct ceph_snapid_map *sm;
1168
+ struct rb_node *p;
1169
+ LIST_HEAD(to_free);
1170
+
1171
+ spin_lock(&mdsc->snapid_map_lock);
1172
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
1173
+ sm = rb_entry(p, struct ceph_snapid_map, node);
1174
+ rb_erase(p, &mdsc->snapid_map_tree);
1175
+ RB_CLEAR_NODE(p);
1176
+ list_move(&sm->lru, &to_free);
1177
+ }
1178
+ spin_unlock(&mdsc->snapid_map_lock);
1179
+
1180
+ while (!list_empty(&to_free)) {
1181
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1182
+ list_del(&sm->lru);
1183
+ free_anon_bdev(sm->dev);
1184
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
1185
+ pr_err("snapid map %llx -> %x still in use\n",
1186
+ sm->snap, sm->dev);
1187
+ }
1188
+ kfree(sm);
1189
+ }
1190
+}