hc
2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/fs/ceph/mdsmap.c
....@@ -13,34 +13,45 @@
1313
1414 #include "super.h"
1515
16
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
17
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
1618
17
-/*
18
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
19
- */
20
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
19
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
2120 {
2221 int n = 0;
23
- int i;
24
-
25
- /* special case for one mds */
26
- if (1 == m->m_num_mds && m->m_info[0].state > 0)
27
- return 0;
22
+ int i, j;
2823
2924 /* count */
30
- for (i = 0; i < m->m_num_mds; i++)
31
- if (m->m_info[i].state > 0)
25
+ for (i = 0; i < m->possible_max_rank; i++)
26
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
3227 n++;
3328 if (n == 0)
3429 return -1;
3530
3631 /* pick */
3732 n = prandom_u32() % n;
38
- i = 0;
39
- for (i = 0; n > 0; i++, n--)
40
- while (m->m_info[i].state <= 0)
41
- i++;
33
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
34
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
35
+ j++;
36
+ if (j > n)
37
+ break;
38
+ }
4239
4340 return i;
41
+}
42
+
43
+/*
44
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
45
+ */
46
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
47
+{
48
+ int mds;
49
+
50
+ mds = __mdsmap_get_random_mds(m, false);
51
+ if (mds == m->possible_max_rank || mds == -1)
52
+ mds = __mdsmap_get_random_mds(m, true);
53
+
54
+ return mds == m->possible_max_rank ? -1 : mds;
4455 }
4556
4657 #define __decode_and_drop_type(p, end, type, bad) \
....@@ -108,8 +119,8 @@
108119 struct ceph_mdsmap *m;
109120 const void *start = *p;
110121 int i, j, n;
111
- int err = -EINVAL;
112
- u8 mdsmap_v, mdsmap_cv;
122
+ int err;
123
+ u8 mdsmap_v;
113124 u16 mdsmap_ev;
114125
115126 m = kzalloc(sizeof(*m), GFP_NOFS);
....@@ -118,7 +129,7 @@
118129
119130 ceph_decode_need(p, end, 1 + 1, bad);
120131 mdsmap_v = ceph_decode_8(p);
121
- mdsmap_cv = ceph_decode_8(p);
132
+ *p += sizeof(u8); /* mdsmap_cv */
122133 if (mdsmap_v >= 4) {
123134 u32 mdsmap_len;
124135 ceph_decode_32_safe(p, end, mdsmap_len, bad);
....@@ -136,19 +147,33 @@
136147 m->m_session_autoclose = ceph_decode_32(p);
137148 m->m_max_file_size = ceph_decode_64(p);
138149 m->m_max_mds = ceph_decode_32(p);
139
- m->m_num_mds = m->m_max_mds;
140150
141
- m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
151
+ /*
152
+ * pick out the active nodes as the m_num_active_mds, the
153
+ * m_num_active_mds maybe larger than m_max_mds when decreasing
154
+ * the max_mds in cluster side, in other case it should less
155
+ * than or equal to m_max_mds.
156
+ */
157
+ m->m_num_active_mds = n = ceph_decode_32(p);
158
+
159
+ /*
160
+ * the possible max rank, it maybe larger than the m_num_active_mds,
161
+ * for example if the mds_max == 2 in the cluster, when the MDS(0)
162
+ * was laggy and being replaced by a new MDS, we will temporarily
163
+ * receive a new mds map with n_num_mds == 1 and the active MDS(1),
164
+ * and the mds rank >= m_num_active_mds.
165
+ */
166
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
167
+
168
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
142169 if (!m->m_info)
143170 goto nomem;
144171
145172 /* pick out active nodes from mds_info (state > 0) */
146
- n = ceph_decode_32(p);
147173 for (i = 0; i < n; i++) {
148174 u64 global_id;
149175 u32 namelen;
150176 s32 mds, inc, state;
151
- u64 state_seq;
152177 u8 info_v;
153178 void *info_end = NULL;
154179 struct ceph_entity_addr addr;
....@@ -156,15 +181,15 @@
156181 void *pexport_targets = NULL;
157182 struct ceph_timespec laggy_since;
158183 struct ceph_mds_info *info;
184
+ bool laggy;
159185
160186 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
161187 global_id = ceph_decode_64(p);
162188 info_v= ceph_decode_8(p);
163189 if (info_v >= 4) {
164190 u32 info_len;
165
- u8 info_cv;
166191 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
167
- info_cv = ceph_decode_8(p);
192
+ *p += sizeof(u8); /* info_cv */
168193 info_len = ceph_decode_32(p);
169194 info_end = *p + info_len;
170195 if (info_end > end)
....@@ -183,10 +208,12 @@
183208 mds = ceph_decode_32(p);
184209 inc = ceph_decode_32(p);
185210 state = ceph_decode_32(p);
186
- state_seq = ceph_decode_64(p);
187
- ceph_decode_copy(p, &addr, sizeof(addr));
188
- ceph_decode_addr(&addr);
211
+ *p += sizeof(u64); /* state_seq */
212
+ err = ceph_decode_entity_addr(p, end, &addr);
213
+ if (err)
214
+ goto corrupt;
189215 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
216
+ laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
190217 *p += sizeof(u32);
191218 ceph_decode_32_safe(p, end, namelen, bad);
192219 *p += namelen;
....@@ -204,31 +231,28 @@
204231 *p = info_end;
205232 }
206233
207
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
234
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
208235 i+1, n, global_id, mds, inc,
209
- ceph_pr_addr(&addr.in_addr),
210
- ceph_mds_state_name(state));
236
+ ceph_pr_addr(&addr),
237
+ ceph_mds_state_name(state),
238
+ laggy ? "(laggy)" : "");
211239
212
- if (mds < 0 || state <= 0)
240
+ if (mds < 0 || mds >= m->possible_max_rank) {
241
+ pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
213242 continue;
243
+ }
214244
215
- if (mds >= m->m_num_mds) {
216
- int new_num = max(mds + 1, m->m_num_mds * 2);
217
- void *new_m_info = krealloc(m->m_info,
218
- new_num * sizeof(*m->m_info),
219
- GFP_NOFS | __GFP_ZERO);
220
- if (!new_m_info)
221
- goto nomem;
222
- m->m_info = new_m_info;
223
- m->m_num_mds = new_num;
245
+ if (state <= 0) {
246
+ dout("mdsmap_decode got incorrect state(%s)\n",
247
+ ceph_mds_state_name(state));
248
+ continue;
224249 }
225250
226251 info = &m->m_info[mds];
227252 info->global_id = global_id;
228253 info->state = state;
229254 info->addr = addr;
230
- info->laggy = (laggy_since.tv_sec != 0 ||
231
- laggy_since.tv_nsec != 0);
255
+ info->laggy = laggy;
232256 info->num_export_targets = num_export_targets;
233257 if (num_export_targets) {
234258 info->export_targets = kcalloc(num_export_targets,
....@@ -241,14 +265,6 @@
241265 } else {
242266 info->export_targets = NULL;
243267 }
244
- }
245
- if (m->m_num_mds > m->m_max_mds) {
246
- /* find max up mds */
247
- for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
248
- if (i == 0 || m->m_info[i-1].state > 0)
249
- break;
250
- }
251
- m->m_num_mds = i;
252268 }
253269
254270 /* pg_pools */
....@@ -291,14 +307,14 @@
291307
292308 for (i = 0; i < n; i++) {
293309 s32 mds = ceph_decode_32(p);
294
- if (mds >= 0 && mds < m->m_num_mds) {
310
+ if (mds >= 0 && mds < m->possible_max_rank) {
295311 if (m->m_info[mds].laggy)
296312 num_laggy++;
297313 }
298314 }
299315 m->m_num_laggy = num_laggy;
300316
301
- if (n > m->m_num_mds) {
317
+ if (n > m->possible_max_rank) {
302318 void *new_m_info = krealloc(m->m_info,
303319 n * sizeof(*m->m_info),
304320 GFP_NOFS | __GFP_ZERO);
....@@ -306,7 +322,7 @@
306322 goto nomem;
307323 m->m_info = new_m_info;
308324 }
309
- m->m_num_mds = n;
325
+ m->possible_max_rank = n;
310326 }
311327
312328 /* inc */
....@@ -352,13 +368,15 @@
352368 m->m_damaged = false;
353369 }
354370 bad_ext:
371
+ dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
372
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
355373 *p = end;
356374 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
357375 return m;
358376 nomem:
359377 err = -ENOMEM;
360378 goto out_err;
361
-bad:
379
+corrupt:
362380 pr_err("corrupt mdsmap\n");
363381 print_hex_dump(KERN_DEBUG, "mdsmap: ",
364382 DUMP_PREFIX_OFFSET, 16, 1,
....@@ -366,15 +384,20 @@
366384 out_err:
367385 ceph_mdsmap_destroy(m);
368386 return ERR_PTR(err);
387
+bad:
388
+ err = -EINVAL;
389
+ goto corrupt;
369390 }
370391
371392 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
372393 {
373394 int i;
374395
375
- for (i = 0; i < m->m_num_mds; i++)
376
- kfree(m->m_info[i].export_targets);
377
- kfree(m->m_info);
396
+ if (m->m_info) {
397
+ for (i = 0; i < m->possible_max_rank; i++)
398
+ kfree(m->m_info[i].export_targets);
399
+ kfree(m->m_info);
400
+ }
378401 kfree(m->m_data_pg_pools);
379402 kfree(m);
380403 }
....@@ -386,9 +409,9 @@
386409 return false;
387410 if (m->m_damaged)
388411 return false;
389
- if (m->m_num_laggy > 0)
412
+ if (m->m_num_laggy == m->m_num_active_mds)
390413 return false;
391
- for (i = 0; i < m->m_num_mds; i++) {
414
+ for (i = 0; i < m->possible_max_rank; i++) {
392415 if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
393416 nr_active++;
394417 }