.. | .. |
---|
13 | 13 | |
---|
14 | 14 | #include "super.h" |
---|
15 | 15 | |
---|
| 16 | +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ |
---|
| 17 | + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) |
---|
16 | 18 | |
---|
17 | | -/* |
---|
18 | | - * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
---|
19 | | - */ |
---|
20 | | -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
---|
| 19 | +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) |
---|
21 | 20 | { |
---|
22 | 21 | int n = 0; |
---|
23 | | - int i; |
---|
24 | | - |
---|
25 | | - /* special case for one mds */ |
---|
26 | | - if (1 == m->m_num_mds && m->m_info[0].state > 0) |
---|
27 | | - return 0; |
---|
| 22 | + int i, j; |
---|
28 | 23 | |
---|
29 | 24 | /* count */ |
---|
30 | | - for (i = 0; i < m->m_num_mds; i++) |
---|
31 | | - if (m->m_info[i].state > 0) |
---|
| 25 | + for (i = 0; i < m->possible_max_rank; i++) |
---|
| 26 | + if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
---|
32 | 27 | n++; |
---|
33 | 28 | if (n == 0) |
---|
34 | 29 | return -1; |
---|
35 | 30 | |
---|
36 | 31 | /* pick */ |
---|
37 | 32 | n = prandom_u32() % n; |
---|
38 | | - i = 0; |
---|
39 | | - for (i = 0; n > 0; i++, n--) |
---|
40 | | - while (m->m_info[i].state <= 0) |
---|
41 | | - i++; |
---|
| 33 | + for (j = 0, i = 0; i < m->possible_max_rank; i++) { |
---|
| 34 | + if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
---|
| 35 | + j++; |
---|
| 36 | + if (j > n) |
---|
| 37 | + break; |
---|
| 38 | + } |
---|
42 | 39 | |
---|
43 | 40 | return i; |
---|
| 41 | +} |
---|
| 42 | + |
---|
| 43 | +/* |
---|
| 44 | + * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
---|
| 45 | + */ |
---|
| 46 | +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
---|
| 47 | +{ |
---|
| 48 | + int mds; |
---|
| 49 | + |
---|
| 50 | + mds = __mdsmap_get_random_mds(m, false); |
---|
| 51 | + if (mds == m->possible_max_rank || mds == -1) |
---|
| 52 | + mds = __mdsmap_get_random_mds(m, true); |
---|
| 53 | + |
---|
| 54 | + return mds == m->possible_max_rank ? -1 : mds; |
---|
44 | 55 | } |
---|
45 | 56 | |
---|
46 | 57 | #define __decode_and_drop_type(p, end, type, bad) \ |
---|
.. | .. |
---|
108 | 119 | struct ceph_mdsmap *m; |
---|
109 | 120 | const void *start = *p; |
---|
110 | 121 | int i, j, n; |
---|
111 | | - int err = -EINVAL; |
---|
112 | | - u8 mdsmap_v, mdsmap_cv; |
---|
| 122 | + int err; |
---|
| 123 | + u8 mdsmap_v; |
---|
113 | 124 | u16 mdsmap_ev; |
---|
114 | 125 | |
---|
115 | 126 | m = kzalloc(sizeof(*m), GFP_NOFS); |
---|
.. | .. |
---|
118 | 129 | |
---|
119 | 130 | ceph_decode_need(p, end, 1 + 1, bad); |
---|
120 | 131 | mdsmap_v = ceph_decode_8(p); |
---|
121 | | - mdsmap_cv = ceph_decode_8(p); |
---|
| 132 | + *p += sizeof(u8); /* mdsmap_cv */ |
---|
122 | 133 | if (mdsmap_v >= 4) { |
---|
123 | 134 | u32 mdsmap_len; |
---|
124 | 135 | ceph_decode_32_safe(p, end, mdsmap_len, bad); |
---|
.. | .. |
---|
136 | 147 | m->m_session_autoclose = ceph_decode_32(p); |
---|
137 | 148 | m->m_max_file_size = ceph_decode_64(p); |
---|
138 | 149 | m->m_max_mds = ceph_decode_32(p); |
---|
139 | | - m->m_num_mds = m->m_max_mds; |
---|
140 | 150 | |
---|
141 | | - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); |
---|
| 151 | + /* |
---|
| 152 | + * pick out the active nodes as m_num_active_mds; |
---|
| 153 | + * m_num_active_mds may be larger than m_max_mds while max_mds is |
---|
| 154 | + * being decreased on the cluster side, in all other cases it |
---|
| 155 | + * should be less than or equal to m_max_mds. |
---|
| 156 | + */ |
---|
| 157 | + m->m_num_active_mds = n = ceph_decode_32(p); |
---|
| 158 | + |
---|
| 159 | + /* |
---|
| 160 | + * the possible max rank, which may be larger than m_num_active_mds. |
---|
| 161 | + * For example, with max_mds == 2 in the cluster, when MDS(0) is |
---|
| 162 | + * laggy and being replaced by a new MDS, we will temporarily |
---|
| 163 | + * receive a new mds map with m_num_active_mds == 1 and only MDS(1) |
---|
| 164 | + * active, so that mds rank >= m_num_active_mds. |
---|
| 165 | + */ |
---|
| 166 | + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); |
---|
| 167 | + |
---|
| 168 | + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); |
---|
142 | 169 | if (!m->m_info) |
---|
143 | 170 | goto nomem; |
---|
144 | 171 | |
---|
145 | 172 | /* pick out active nodes from mds_info (state > 0) */ |
---|
146 | | - n = ceph_decode_32(p); |
---|
147 | 173 | for (i = 0; i < n; i++) { |
---|
148 | 174 | u64 global_id; |
---|
149 | 175 | u32 namelen; |
---|
150 | 176 | s32 mds, inc, state; |
---|
151 | | - u64 state_seq; |
---|
152 | 177 | u8 info_v; |
---|
153 | 178 | void *info_end = NULL; |
---|
154 | 179 | struct ceph_entity_addr addr; |
---|
.. | .. |
---|
156 | 181 | void *pexport_targets = NULL; |
---|
157 | 182 | struct ceph_timespec laggy_since; |
---|
158 | 183 | struct ceph_mds_info *info; |
---|
| 184 | + bool laggy; |
---|
159 | 185 | |
---|
160 | 186 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
---|
161 | 187 | global_id = ceph_decode_64(p); |
---|
162 | 188 | info_v= ceph_decode_8(p); |
---|
163 | 189 | if (info_v >= 4) { |
---|
164 | 190 | u32 info_len; |
---|
165 | | - u8 info_cv; |
---|
166 | 191 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); |
---|
167 | | - info_cv = ceph_decode_8(p); |
---|
| 192 | + *p += sizeof(u8); /* info_cv */ |
---|
168 | 193 | info_len = ceph_decode_32(p); |
---|
169 | 194 | info_end = *p + info_len; |
---|
170 | 195 | if (info_end > end) |
---|
.. | .. |
---|
183 | 208 | mds = ceph_decode_32(p); |
---|
184 | 209 | inc = ceph_decode_32(p); |
---|
185 | 210 | state = ceph_decode_32(p); |
---|
186 | | - state_seq = ceph_decode_64(p); |
---|
187 | | - ceph_decode_copy(p, &addr, sizeof(addr)); |
---|
188 | | - ceph_decode_addr(&addr); |
---|
| 211 | + *p += sizeof(u64); /* state_seq */ |
---|
| 212 | + err = ceph_decode_entity_addr(p, end, &addr); |
---|
| 213 | + if (err) |
---|
| 214 | + goto corrupt; |
---|
189 | 215 | ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); |
---|
| 216 | + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; |
---|
190 | 217 | *p += sizeof(u32); |
---|
191 | 218 | ceph_decode_32_safe(p, end, namelen, bad); |
---|
192 | 219 | *p += namelen; |
---|
.. | .. |
---|
204 | 231 | *p = info_end; |
---|
205 | 232 | } |
---|
206 | 233 | |
---|
207 | | - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", |
---|
| 234 | + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", |
---|
208 | 235 | i+1, n, global_id, mds, inc, |
---|
209 | | - ceph_pr_addr(&addr.in_addr), |
---|
210 | | - ceph_mds_state_name(state)); |
---|
| 236 | + ceph_pr_addr(&addr), |
---|
| 237 | + ceph_mds_state_name(state), |
---|
| 238 | + laggy ? "(laggy)" : ""); |
---|
211 | 239 | |
---|
212 | | - if (mds < 0 || state <= 0) |
---|
| 240 | + if (mds < 0 || mds >= m->possible_max_rank) { |
---|
| 241 | + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); |
---|
213 | 242 | continue; |
---|
| 243 | + } |
---|
214 | 244 | |
---|
215 | | - if (mds >= m->m_num_mds) { |
---|
216 | | - int new_num = max(mds + 1, m->m_num_mds * 2); |
---|
217 | | - void *new_m_info = krealloc(m->m_info, |
---|
218 | | - new_num * sizeof(*m->m_info), |
---|
219 | | - GFP_NOFS | __GFP_ZERO); |
---|
220 | | - if (!new_m_info) |
---|
221 | | - goto nomem; |
---|
222 | | - m->m_info = new_m_info; |
---|
223 | | - m->m_num_mds = new_num; |
---|
| 245 | + if (state <= 0) { |
---|
| 246 | + dout("mdsmap_decode got incorrect state(%s)\n", |
---|
| 247 | + ceph_mds_state_name(state)); |
---|
| 248 | + continue; |
---|
224 | 249 | } |
---|
225 | 250 | |
---|
226 | 251 | info = &m->m_info[mds]; |
---|
227 | 252 | info->global_id = global_id; |
---|
228 | 253 | info->state = state; |
---|
229 | 254 | info->addr = addr; |
---|
230 | | - info->laggy = (laggy_since.tv_sec != 0 || |
---|
231 | | - laggy_since.tv_nsec != 0); |
---|
| 255 | + info->laggy = laggy; |
---|
232 | 256 | info->num_export_targets = num_export_targets; |
---|
233 | 257 | if (num_export_targets) { |
---|
234 | 258 | info->export_targets = kcalloc(num_export_targets, |
---|
.. | .. |
---|
241 | 265 | } else { |
---|
242 | 266 | info->export_targets = NULL; |
---|
243 | 267 | } |
---|
244 | | - } |
---|
245 | | - if (m->m_num_mds > m->m_max_mds) { |
---|
246 | | - /* find max up mds */ |
---|
247 | | - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { |
---|
248 | | - if (i == 0 || m->m_info[i-1].state > 0) |
---|
249 | | - break; |
---|
250 | | - } |
---|
251 | | - m->m_num_mds = i; |
---|
252 | 268 | } |
---|
253 | 269 | |
---|
254 | 270 | /* pg_pools */ |
---|
.. | .. |
---|
291 | 307 | |
---|
292 | 308 | for (i = 0; i < n; i++) { |
---|
293 | 309 | s32 mds = ceph_decode_32(p); |
---|
294 | | - if (mds >= 0 && mds < m->m_num_mds) { |
---|
| 310 | + if (mds >= 0 && mds < m->possible_max_rank) { |
---|
295 | 311 | if (m->m_info[mds].laggy) |
---|
296 | 312 | num_laggy++; |
---|
297 | 313 | } |
---|
298 | 314 | } |
---|
299 | 315 | m->m_num_laggy = num_laggy; |
---|
300 | 316 | |
---|
301 | | - if (n > m->m_num_mds) { |
---|
| 317 | + if (n > m->possible_max_rank) { |
---|
302 | 318 | void *new_m_info = krealloc(m->m_info, |
---|
303 | 319 | n * sizeof(*m->m_info), |
---|
304 | 320 | GFP_NOFS | __GFP_ZERO); |
---|
.. | .. |
---|
306 | 322 | goto nomem; |
---|
307 | 323 | m->m_info = new_m_info; |
---|
308 | 324 | } |
---|
309 | | - m->m_num_mds = n; |
---|
| 325 | + m->possible_max_rank = n; |
---|
310 | 326 | } |
---|
311 | 327 | |
---|
312 | 328 | /* inc */ |
---|
.. | .. |
---|
352 | 368 | m->m_damaged = false; |
---|
353 | 369 | } |
---|
354 | 370 | bad_ext: |
---|
| 371 | + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", |
---|
| 372 | + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); |
---|
355 | 373 | *p = end; |
---|
356 | 374 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); |
---|
357 | 375 | return m; |
---|
358 | 376 | nomem: |
---|
359 | 377 | err = -ENOMEM; |
---|
360 | 378 | goto out_err; |
---|
361 | | -bad: |
---|
| 379 | +corrupt: |
---|
362 | 380 | pr_err("corrupt mdsmap\n"); |
---|
363 | 381 | print_hex_dump(KERN_DEBUG, "mdsmap: ", |
---|
364 | 382 | DUMP_PREFIX_OFFSET, 16, 1, |
---|
.. | .. |
---|
366 | 384 | out_err: |
---|
367 | 385 | ceph_mdsmap_destroy(m); |
---|
368 | 386 | return ERR_PTR(err); |
---|
| 387 | +bad: |
---|
| 388 | + err = -EINVAL; |
---|
| 389 | + goto corrupt; |
---|
369 | 390 | } |
---|
370 | 391 | |
---|
371 | 392 | void ceph_mdsmap_destroy(struct ceph_mdsmap *m) |
---|
372 | 393 | { |
---|
373 | 394 | int i; |
---|
374 | 395 | |
---|
375 | | - for (i = 0; i < m->m_num_mds; i++) |
---|
376 | | - kfree(m->m_info[i].export_targets); |
---|
377 | | - kfree(m->m_info); |
---|
| 396 | + if (m->m_info) { |
---|
| 397 | + for (i = 0; i < m->possible_max_rank; i++) |
---|
| 398 | + kfree(m->m_info[i].export_targets); |
---|
| 399 | + kfree(m->m_info); |
---|
| 400 | + } |
---|
378 | 401 | kfree(m->m_data_pg_pools); |
---|
379 | 402 | kfree(m); |
---|
380 | 403 | } |
---|
.. | .. |
---|
386 | 409 | return false; |
---|
387 | 410 | if (m->m_damaged) |
---|
388 | 411 | return false; |
---|
389 | | - if (m->m_num_laggy > 0) |
---|
| 412 | + if (m->m_num_laggy == m->m_num_active_mds) |
---|
390 | 413 | return false; |
---|
391 | | - for (i = 0; i < m->m_num_mds; i++) { |
---|
| 414 | + for (i = 0; i < m->possible_max_rank; i++) { |
---|
392 | 415 | if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) |
---|
393 | 416 | nr_active++; |
---|
394 | 417 | } |
---|