From e636c8d336489bf3eed5878299e6cc045bbad077 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:17:29 +0000
Subject: [PATCH] debug lk
---
kernel/fs/ceph/mdsmap.c | 141 +++++++++++++++++++++++++++-------------------
1 files changed, 82 insertions(+), 59 deletions(-)
diff --git a/kernel/fs/ceph/mdsmap.c b/kernel/fs/ceph/mdsmap.c
index 44e53ab..47f2903 100644
--- a/kernel/fs/ceph/mdsmap.c
+++ b/kernel/fs/ceph/mdsmap.c
@@ -13,34 +13,45 @@
#include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
{
int n = 0;
- int i;
-
- /* special case for one mds */
- if (1 == m->m_num_mds && m->m_info[0].state > 0)
- return 0;
+ int i, j;
/* count */
- for (i = 0; i < m->m_num_mds; i++)
- if (m->m_info[i].state > 0)
+ for (i = 0; i < m->possible_max_rank; i++)
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
n++;
if (n == 0)
return -1;
/* pick */
n = prandom_u32() % n;
- i = 0;
- for (i = 0; n > 0; i++, n--)
- while (m->m_info[i].state <= 0)
- i++;
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
+ j++;
+ if (j > n)
+ break;
+ }
return i;
+}
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int mds;
+
+ mds = __mdsmap_get_random_mds(m, false);
+ if (mds == m->possible_max_rank || mds == -1)
+ mds = __mdsmap_get_random_mds(m, true);
+
+ return mds == m->possible_max_rank ? -1 : mds;
}
#define __decode_and_drop_type(p, end, type, bad) \
@@ -108,8 +119,8 @@
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
- int err = -EINVAL;
- u8 mdsmap_v, mdsmap_cv;
+ int err;
+ u8 mdsmap_v;
u16 mdsmap_ev;
m = kzalloc(sizeof(*m), GFP_NOFS);
@@ -118,7 +129,7 @@
ceph_decode_need(p, end, 1 + 1, bad);
mdsmap_v = ceph_decode_8(p);
- mdsmap_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
if (mdsmap_v >= 4) {
u32 mdsmap_len;
ceph_decode_32_safe(p, end, mdsmap_len, bad);
@@ -136,19 +147,33 @@
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
- m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
+ /*
+ * pick out the active nodes as the m_num_active_mds, the
+ * m_num_active_mds maybe larger than m_max_mds when decreasing
+ * the max_mds in cluster side, in other case it should less
+ * than or equal to m_max_mds.
+ */
+ m->m_num_active_mds = n = ceph_decode_32(p);
+
+ /*
+ * the possible max rank, it maybe larger than the m_num_active_mds,
+ * for example if the mds_max == 2 in the cluster, when the MDS(0)
+ * was laggy and being replaced by a new MDS, we will temporarily
+ * receive a new mds map with n_num_mds == 1 and the active MDS(1),
+ * and the mds rank >= m_num_active_mds.
+ */
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
if (!m->m_info)
goto nomem;
/* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
s32 mds, inc, state;
- u64 state_seq;
u8 info_v;
void *info_end = NULL;
struct ceph_entity_addr addr;
@@ -156,15 +181,15 @@
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
+ bool laggy;
ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p);
info_v= ceph_decode_8(p);
if (info_v >= 4) {
u32 info_len;
- u8 info_cv;
ceph_decode_need(p, end, 1 + sizeof(u32), bad);
- info_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* info_cv */
info_len = ceph_decode_32(p);
info_end = *p + info_len;
if (info_end > end)
@@ -183,10 +208,12 @@
mds = ceph_decode_32(p);
inc = ceph_decode_32(p);
state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
+ *p += sizeof(u64); /* state_seq */
+ err = ceph_decode_entity_addr(p, end, &addr);
+ if (err)
+ goto corrupt;
ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+ laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
@@ -204,31 +231,28 @@
*p = info_end;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr.in_addr),
- ceph_mds_state_name(state));
+ ceph_pr_addr(&addr),
+ ceph_mds_state_name(state),
+ laggy ? "(laggy)" : "");
- if (mds < 0 || state <= 0)
+ if (mds < 0 || mds >= m->possible_max_rank) {
+ pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
continue;
+ }
- if (mds >= m->m_num_mds) {
- int new_num = max(mds + 1, m->m_num_mds * 2);
- void *new_m_info = krealloc(m->m_info,
- new_num * sizeof(*m->m_info),
- GFP_NOFS | __GFP_ZERO);
- if (!new_m_info)
- goto nomem;
- m->m_info = new_m_info;
- m->m_num_mds = new_num;
+ if (state <= 0) {
+ dout("mdsmap_decode got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
+ continue;
}
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
info->addr = addr;
- info->laggy = (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
+ info->laggy = laggy;
info->num_export_targets = num_export_targets;
if (num_export_targets) {
info->export_targets = kcalloc(num_export_targets,
@@ -241,14 +265,6 @@
} else {
info->export_targets = NULL;
}
- }
- if (m->m_num_mds > m->m_max_mds) {
- /* find max up mds */
- for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
- if (i == 0 || m->m_info[i-1].state > 0)
- break;
- }
- m->m_num_mds = i;
}
/* pg_pools */
@@ -291,14 +307,14 @@
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_num_mds) {
+ if (mds >= 0 && mds < m->possible_max_rank) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
- if (n > m->m_num_mds) {
+ if (n > m->possible_max_rank) {
void *new_m_info = krealloc(m->m_info,
n * sizeof(*m->m_info),
GFP_NOFS | __GFP_ZERO);
@@ -306,7 +322,7 @@
goto nomem;
m->m_info = new_m_info;
}
- m->m_num_mds = n;
+ m->possible_max_rank = n;
}
/* inc */
@@ -352,13 +368,15 @@
m->m_damaged = false;
}
bad_ext:
+ dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
*p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
nomem:
err = -ENOMEM;
goto out_err;
-bad:
+corrupt:
pr_err("corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
@@ -366,15 +384,20 @@
out_err:
ceph_mdsmap_destroy(m);
return ERR_PTR(err);
+bad:
+ err = -EINVAL;
+ goto corrupt;
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_num_mds; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
kfree(m->m_data_pg_pools);
kfree(m);
}
@@ -386,9 +409,9 @@
return false;
if (m->m_damaged)
return false;
- if (m->m_num_laggy > 0)
+ if (m->m_num_laggy == m->m_num_active_mds)
return false;
- for (i = 0; i < m->m_num_mds; i++) {
+ for (i = 0; i < m->possible_max_rank; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}
--
Gitblit v1.6.2