| .. | .. |
|---|
| 13 | 13 | |
|---|
| 14 | 14 | #include "super.h" |
|---|
| 15 | 15 | |
|---|
| 16 | +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ |
|---|
| 17 | + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) |
|---|
| 16 | 18 | |
|---|
| 17 | | -/* |
|---|
| 18 | | - * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
|---|
| 19 | | - */ |
|---|
| 20 | | -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
|---|
| 19 | +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) |
|---|
| 21 | 20 | { |
|---|
| 22 | 21 | int n = 0; |
|---|
| 23 | | - int i; |
|---|
| 24 | | - |
|---|
| 25 | | - /* special case for one mds */ |
|---|
| 26 | | - if (1 == m->m_num_mds && m->m_info[0].state > 0) |
|---|
| 27 | | - return 0; |
|---|
| 22 | + int i, j; |
|---|
| 28 | 23 | |
|---|
| 29 | 24 | /* count */ |
|---|
| 30 | | - for (i = 0; i < m->m_num_mds; i++) |
|---|
| 31 | | - if (m->m_info[i].state > 0) |
|---|
| 25 | + for (i = 0; i < m->possible_max_rank; i++) |
|---|
| 26 | + if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
|---|
| 32 | 27 | n++; |
|---|
| 33 | 28 | if (n == 0) |
|---|
| 34 | 29 | return -1; |
|---|
| 35 | 30 | |
|---|
| 36 | 31 | /* pick */ |
|---|
| 37 | 32 | n = prandom_u32() % n; |
|---|
| 38 | | - i = 0; |
|---|
| 39 | | - for (i = 0; n > 0; i++, n--) |
|---|
| 40 | | - while (m->m_info[i].state <= 0) |
|---|
| 41 | | - i++; |
|---|
| 33 | + for (j = 0, i = 0; i < m->possible_max_rank; i++) { |
|---|
| 34 | + if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
|---|
| 35 | + j++; |
|---|
| 36 | + if (j > n) |
|---|
| 37 | + break; |
|---|
| 38 | + } |
|---|
| 42 | 39 | |
|---|
| 43 | 40 | return i; |
|---|
| 41 | +} |
|---|
| 42 | + |
|---|
| 43 | +/* |
|---|
| 44 | + * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
|---|
| 45 | + */ |
|---|
| 46 | +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
|---|
| 47 | +{ |
|---|
| 48 | + int mds; |
|---|
| 49 | + |
|---|
| 50 | + mds = __mdsmap_get_random_mds(m, false); |
|---|
| 51 | + if (mds == m->possible_max_rank || mds == -1) |
|---|
| 52 | + mds = __mdsmap_get_random_mds(m, true); |
|---|
| 53 | + |
|---|
| 54 | + return mds == m->possible_max_rank ? -1 : mds; |
|---|
| 44 | 55 | } |
|---|
| 45 | 56 | |
|---|
| 46 | 57 | #define __decode_and_drop_type(p, end, type, bad) \ |
|---|
| .. | .. |
|---|
| 108 | 119 | struct ceph_mdsmap *m; |
|---|
| 109 | 120 | const void *start = *p; |
|---|
| 110 | 121 | int i, j, n; |
|---|
| 111 | | - int err = -EINVAL; |
|---|
| 112 | | - u8 mdsmap_v, mdsmap_cv; |
|---|
| 122 | + int err; |
|---|
| 123 | + u8 mdsmap_v; |
|---|
| 113 | 124 | u16 mdsmap_ev; |
|---|
| 114 | 125 | |
|---|
| 115 | 126 | m = kzalloc(sizeof(*m), GFP_NOFS); |
|---|
| .. | .. |
|---|
| 118 | 129 | |
|---|
| 119 | 130 | ceph_decode_need(p, end, 1 + 1, bad); |
|---|
| 120 | 131 | mdsmap_v = ceph_decode_8(p); |
|---|
| 121 | | - mdsmap_cv = ceph_decode_8(p); |
|---|
| 132 | + *p += sizeof(u8); /* mdsmap_cv */ |
|---|
| 122 | 133 | if (mdsmap_v >= 4) { |
|---|
| 123 | 134 | u32 mdsmap_len; |
|---|
| 124 | 135 | ceph_decode_32_safe(p, end, mdsmap_len, bad); |
|---|
| .. | .. |
|---|
| 136 | 147 | m->m_session_autoclose = ceph_decode_32(p); |
|---|
| 137 | 148 | m->m_max_file_size = ceph_decode_64(p); |
|---|
| 138 | 149 | m->m_max_mds = ceph_decode_32(p); |
|---|
| 139 | | - m->m_num_mds = m->m_max_mds; |
|---|
| 140 | 150 | |
|---|
| 141 | | - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); |
|---|
| 151 | + /* |
|---|
| 152 | + * pick out the active nodes as the m_num_active_mds; the |
|---|
| 153 | + * m_num_active_mds may be larger than m_max_mds when decreasing |
|---|
| 154 | + * the max_mds on the cluster side, in other cases it should be less |
|---|
| 155 | + * than or equal to m_max_mds. |
|---|
| 156 | + */ |
|---|
| 157 | + m->m_num_active_mds = n = ceph_decode_32(p); |
|---|
| 158 | + |
|---|
| 159 | + /* |
|---|
| 160 | + * the possible max rank, it may be larger than the m_num_active_mds, |
|---|
| 161 | + * for example if the max_mds == 2 in the cluster, when the MDS(0) |
|---|
| 162 | + * was laggy and is being replaced by a new MDS, we will temporarily |
|---|
| 163 | + * receive a new mds map with m_num_active_mds == 1 and the active MDS(1), |
|---|
| 164 | + * and the mds rank >= m_num_active_mds. |
|---|
| 165 | + */ |
|---|
| 166 | + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); |
|---|
| 167 | + |
|---|
| 168 | + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); |
|---|
| 142 | 169 | if (!m->m_info) |
|---|
| 143 | 170 | goto nomem; |
|---|
| 144 | 171 | |
|---|
| 145 | 172 | /* pick out active nodes from mds_info (state > 0) */ |
|---|
| 146 | | - n = ceph_decode_32(p); |
|---|
| 147 | 173 | for (i = 0; i < n; i++) { |
|---|
| 148 | 174 | u64 global_id; |
|---|
| 149 | 175 | u32 namelen; |
|---|
| 150 | 176 | s32 mds, inc, state; |
|---|
| 151 | | - u64 state_seq; |
|---|
| 152 | 177 | u8 info_v; |
|---|
| 153 | 178 | void *info_end = NULL; |
|---|
| 154 | 179 | struct ceph_entity_addr addr; |
|---|
| .. | .. |
|---|
| 156 | 181 | void *pexport_targets = NULL; |
|---|
| 157 | 182 | struct ceph_timespec laggy_since; |
|---|
| 158 | 183 | struct ceph_mds_info *info; |
|---|
| 184 | + bool laggy; |
|---|
| 159 | 185 | |
|---|
| 160 | 186 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
|---|
| 161 | 187 | global_id = ceph_decode_64(p); |
|---|
| 162 | 188 | info_v= ceph_decode_8(p); |
|---|
| 163 | 189 | if (info_v >= 4) { |
|---|
| 164 | 190 | u32 info_len; |
|---|
| 165 | | - u8 info_cv; |
|---|
| 166 | 191 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); |
|---|
| 167 | | - info_cv = ceph_decode_8(p); |
|---|
| 192 | + *p += sizeof(u8); /* info_cv */ |
|---|
| 168 | 193 | info_len = ceph_decode_32(p); |
|---|
| 169 | 194 | info_end = *p + info_len; |
|---|
| 170 | 195 | if (info_end > end) |
|---|
| .. | .. |
|---|
| 183 | 208 | mds = ceph_decode_32(p); |
|---|
| 184 | 209 | inc = ceph_decode_32(p); |
|---|
| 185 | 210 | state = ceph_decode_32(p); |
|---|
| 186 | | - state_seq = ceph_decode_64(p); |
|---|
| 187 | | - ceph_decode_copy(p, &addr, sizeof(addr)); |
|---|
| 188 | | - ceph_decode_addr(&addr); |
|---|
| 211 | + *p += sizeof(u64); /* state_seq */ |
|---|
| 212 | + err = ceph_decode_entity_addr(p, end, &addr); |
|---|
| 213 | + if (err) |
|---|
| 214 | + goto corrupt; |
|---|
| 189 | 215 | ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); |
|---|
| 216 | + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; |
|---|
| 190 | 217 | *p += sizeof(u32); |
|---|
| 191 | 218 | ceph_decode_32_safe(p, end, namelen, bad); |
|---|
| 192 | 219 | *p += namelen; |
|---|
| .. | .. |
|---|
| 204 | 231 | *p = info_end; |
|---|
| 205 | 232 | } |
|---|
| 206 | 233 | |
|---|
| 207 | | - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", |
|---|
| 234 | + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", |
|---|
| 208 | 235 | i+1, n, global_id, mds, inc, |
|---|
| 209 | | - ceph_pr_addr(&addr.in_addr), |
|---|
| 210 | | - ceph_mds_state_name(state)); |
|---|
| 236 | + ceph_pr_addr(&addr), |
|---|
| 237 | + ceph_mds_state_name(state), |
|---|
| 238 | + laggy ? "(laggy)" : ""); |
|---|
| 211 | 239 | |
|---|
| 212 | | - if (mds < 0 || state <= 0) |
|---|
| 240 | + if (mds < 0 || mds >= m->possible_max_rank) { |
|---|
| 241 | + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); |
|---|
| 213 | 242 | continue; |
|---|
| 243 | + } |
|---|
| 214 | 244 | |
|---|
| 215 | | - if (mds >= m->m_num_mds) { |
|---|
| 216 | | - int new_num = max(mds + 1, m->m_num_mds * 2); |
|---|
| 217 | | - void *new_m_info = krealloc(m->m_info, |
|---|
| 218 | | - new_num * sizeof(*m->m_info), |
|---|
| 219 | | - GFP_NOFS | __GFP_ZERO); |
|---|
| 220 | | - if (!new_m_info) |
|---|
| 221 | | - goto nomem; |
|---|
| 222 | | - m->m_info = new_m_info; |
|---|
| 223 | | - m->m_num_mds = new_num; |
|---|
| 245 | + if (state <= 0) { |
|---|
| 246 | + dout("mdsmap_decode got incorrect state(%s)\n", |
|---|
| 247 | + ceph_mds_state_name(state)); |
|---|
| 248 | + continue; |
|---|
| 224 | 249 | } |
|---|
| 225 | 250 | |
|---|
| 226 | 251 | info = &m->m_info[mds]; |
|---|
| 227 | 252 | info->global_id = global_id; |
|---|
| 228 | 253 | info->state = state; |
|---|
| 229 | 254 | info->addr = addr; |
|---|
| 230 | | - info->laggy = (laggy_since.tv_sec != 0 || |
|---|
| 231 | | - laggy_since.tv_nsec != 0); |
|---|
| 255 | + info->laggy = laggy; |
|---|
| 232 | 256 | info->num_export_targets = num_export_targets; |
|---|
| 233 | 257 | if (num_export_targets) { |
|---|
| 234 | 258 | info->export_targets = kcalloc(num_export_targets, |
|---|
| .. | .. |
|---|
| 241 | 265 | } else { |
|---|
| 242 | 266 | info->export_targets = NULL; |
|---|
| 243 | 267 | } |
|---|
| 244 | | - } |
|---|
| 245 | | - if (m->m_num_mds > m->m_max_mds) { |
|---|
| 246 | | - /* find max up mds */ |
|---|
| 247 | | - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { |
|---|
| 248 | | - if (i == 0 || m->m_info[i-1].state > 0) |
|---|
| 249 | | - break; |
|---|
| 250 | | - } |
|---|
| 251 | | - m->m_num_mds = i; |
|---|
| 252 | 268 | } |
|---|
| 253 | 269 | |
|---|
| 254 | 270 | /* pg_pools */ |
|---|
| .. | .. |
|---|
| 291 | 307 | |
|---|
| 292 | 308 | for (i = 0; i < n; i++) { |
|---|
| 293 | 309 | s32 mds = ceph_decode_32(p); |
|---|
| 294 | | - if (mds >= 0 && mds < m->m_num_mds) { |
|---|
| 310 | + if (mds >= 0 && mds < m->possible_max_rank) { |
|---|
| 295 | 311 | if (m->m_info[mds].laggy) |
|---|
| 296 | 312 | num_laggy++; |
|---|
| 297 | 313 | } |
|---|
| 298 | 314 | } |
|---|
| 299 | 315 | m->m_num_laggy = num_laggy; |
|---|
| 300 | 316 | |
|---|
| 301 | | - if (n > m->m_num_mds) { |
|---|
| 317 | + if (n > m->possible_max_rank) { |
|---|
| 302 | 318 | void *new_m_info = krealloc(m->m_info, |
|---|
| 303 | 319 | n * sizeof(*m->m_info), |
|---|
| 304 | 320 | GFP_NOFS | __GFP_ZERO); |
|---|
| .. | .. |
|---|
| 306 | 322 | goto nomem; |
|---|
| 307 | 323 | m->m_info = new_m_info; |
|---|
| 308 | 324 | } |
|---|
| 309 | | - m->m_num_mds = n; |
|---|
| 325 | + m->possible_max_rank = n; |
|---|
| 310 | 326 | } |
|---|
| 311 | 327 | |
|---|
| 312 | 328 | /* inc */ |
|---|
| .. | .. |
|---|
| 352 | 368 | m->m_damaged = false; |
|---|
| 353 | 369 | } |
|---|
| 354 | 370 | bad_ext: |
|---|
| 371 | + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", |
|---|
| 372 | + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); |
|---|
| 355 | 373 | *p = end; |
|---|
| 356 | 374 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); |
|---|
| 357 | 375 | return m; |
|---|
| 358 | 376 | nomem: |
|---|
| 359 | 377 | err = -ENOMEM; |
|---|
| 360 | 378 | goto out_err; |
|---|
| 361 | | -bad: |
|---|
| 379 | +corrupt: |
|---|
| 362 | 380 | pr_err("corrupt mdsmap\n"); |
|---|
| 363 | 381 | print_hex_dump(KERN_DEBUG, "mdsmap: ", |
|---|
| 364 | 382 | DUMP_PREFIX_OFFSET, 16, 1, |
|---|
| .. | .. |
|---|
| 366 | 384 | out_err: |
|---|
| 367 | 385 | ceph_mdsmap_destroy(m); |
|---|
| 368 | 386 | return ERR_PTR(err); |
|---|
| 387 | +bad: |
|---|
| 388 | + err = -EINVAL; |
|---|
| 389 | + goto corrupt; |
|---|
| 369 | 390 | } |
|---|
| 370 | 391 | |
|---|
| 371 | 392 | void ceph_mdsmap_destroy(struct ceph_mdsmap *m) |
|---|
| 372 | 393 | { |
|---|
| 373 | 394 | int i; |
|---|
| 374 | 395 | |
|---|
| 375 | | - for (i = 0; i < m->m_num_mds; i++) |
|---|
| 376 | | - kfree(m->m_info[i].export_targets); |
|---|
| 377 | | - kfree(m->m_info); |
|---|
| 396 | + if (m->m_info) { |
|---|
| 397 | + for (i = 0; i < m->possible_max_rank; i++) |
|---|
| 398 | + kfree(m->m_info[i].export_targets); |
|---|
| 399 | + kfree(m->m_info); |
|---|
| 400 | + } |
|---|
| 378 | 401 | kfree(m->m_data_pg_pools); |
|---|
| 379 | 402 | kfree(m); |
|---|
| 380 | 403 | } |
|---|
| .. | .. |
|---|
| 386 | 409 | return false; |
|---|
| 387 | 410 | if (m->m_damaged) |
|---|
| 388 | 411 | return false; |
|---|
| 389 | | - if (m->m_num_laggy > 0) |
|---|
| 412 | + if (m->m_num_laggy == m->m_num_active_mds) |
|---|
| 390 | 413 | return false; |
|---|
| 391 | | - for (i = 0; i < m->m_num_mds; i++) { |
|---|
| 414 | + for (i = 0; i < m->possible_max_rank; i++) { |
|---|
| 392 | 415 | if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) |
|---|
| 393 | 416 | nr_active++; |
|---|
| 394 | 417 | } |
|---|