Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
3d14c5d2 | 2 | #include <linux/ceph/ceph_debug.h> |
2f2dc053 SW |
3 | |
4 | #include <linux/bug.h> | |
5 | #include <linux/err.h> | |
6 | #include <linux/random.h> | |
7 | #include <linux/slab.h> | |
8 | #include <linux/types.h> | |
9 | ||
3d14c5d2 YS |
10 | #include <linux/ceph/mdsmap.h> |
11 | #include <linux/ceph/messenger.h> | |
12 | #include <linux/ceph/decode.h> | |
2f2dc053 SW |
13 | |
14 | #include "super.h" | |
15 | ||
/*
 * Evaluates to true when rank @i is usable: its state is > 0 and, unless
 * @ignore_laggy is set, it is not flagged laggy.
 *
 * The expansion reads a variable named 'm' from the calling scope; it must
 * point at a map whose m_info[] array covers index @i.
 *
 * The laggy test is grouped explicitly: '&&' binds tighter than '?:', so an
 * ungrouped "state > 0 && ignore ? true : !laggy" would accept ranks whose
 * state is <= 0.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && \
	 ((ignore_laggy) || !m->m_info[i].laggy))
2f2dc053 | 18 | |
5d47648f | 19 | static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) |
2f2dc053 SW |
20 | { |
21 | int n = 0; | |
74d6f030 | 22 | int i, j; |
a84cd293 | 23 | |
2f2dc053 | 24 | /* count */ |
b38c9eb4 | 25 | for (i = 0; i < m->possible_max_rank; i++) |
5d47648f | 26 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
2f2dc053 SW |
27 | n++; |
28 | if (n == 0) | |
29 | return -1; | |
30 | ||
31 | /* pick */ | |
a84cd293 | 32 | n = prandom_u32() % n; |
b38c9eb4 | 33 | for (j = 0, i = 0; i < m->possible_max_rank; i++) { |
5d47648f | 34 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
74d6f030 XL |
35 | j++; |
36 | if (j > n) | |
37 | break; | |
38 | } | |
2f2dc053 SW |
39 | |
40 | return i; | |
41 | } | |
42 | ||
5d47648f XL |
43 | /* |
44 | * choose a random mds that is "up" (i.e. has a state > 0), or -1. | |
45 | */ | |
46 | int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) | |
47 | { | |
48 | int mds; | |
49 | ||
50 | mds = __mdsmap_get_random_mds(m, false); | |
b38c9eb4 | 51 | if (mds == m->possible_max_rank || mds == -1) |
5d47648f XL |
52 | mds = __mdsmap_get_random_mds(m, true); |
53 | ||
b38c9eb4 | 54 | return mds == m->possible_max_rank ? -1 : mds; |
5d47648f XL |
55 | } |
56 | ||
e9e427f0 YZ |
/*
 * Skip one fixed-size field of @type without looking at its value.
 *
 * @p:    void ** cursor; *@p is advanced by sizeof(@type) on success
 * @end:  one past the last readable byte
 * @bad:  label to jump to when fewer than sizeof(@type) bytes remain
 *
 * @p and @end are parenthesized so that any pointer expression can be
 * passed, not just a plain identifier.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*(p) + sizeof(type) > (end))		\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
63 | ||
/*
 * Skip an encoded set: a 32-bit element count followed by that many
 * elements of @type.
 *
 * @p:    void ** cursor; advanced past the count and all elements
 * @end:  one past the last readable byte
 * @bad:  label to jump to when the encoding does not fit in the buffer
 *
 * The count is validated before it is multiplied, so the byte size cannot
 * wrap where size_t is 32 bits wide.  Locals carry a '__' prefix to avoid
 * shadowing variables of the calling function.
 */
#define __decode_and_drop_set(p, end, type, bad)			\
	do {								\
		u32 __cnt;						\
		size_t __need;						\
		ceph_decode_32_safe(p, end, __cnt, bad);		\
		if (__cnt > ((size_t)-1) / sizeof(type))		\
			goto bad;					\
		__need = sizeof(type) * __cnt;				\
		ceph_decode_need(p, end, __need, bad);			\
		*(p) += __need;						\
	} while (0)
73 | ||
/*
 * Skip an encoded map: a 32-bit entry count followed by that many
 * (@ktype, @vtype) pairs of fixed size.
 *
 * @p:    void ** cursor; advanced past the count and all entries
 * @end:  one past the last readable byte
 * @bad:  label to jump to when the encoding does not fit in the buffer
 *
 * The count is validated before it is multiplied, so the byte size cannot
 * wrap where size_t is 32 bits wide.  Locals carry a '__' prefix to avoid
 * shadowing variables of the calling function.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)		\
	do {								\
		u32 __cnt;						\
		size_t __need;						\
		ceph_decode_32_safe(p, end, __cnt, bad);		\
		if (__cnt > ((size_t)-1) /				\
			    (sizeof(ktype) + sizeof(vtype)))		\
			goto bad;					\
		__need = (sizeof(ktype) + sizeof(vtype)) * __cnt;	\
		ceph_decode_need(p, end, __need, bad);			\
		*(p) += __need;						\
	} while (0)
83 | ||
84 | ||
85 | static int __decode_and_drop_compat_set(void **p, void* end) | |
86 | { | |
87 | int i; | |
88 | /* compat, ro_compat, incompat*/ | |
89 | for (i = 0; i < 3; i++) { | |
90 | u32 n; | |
91 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | |
92 | /* mask */ | |
93 | *p += sizeof(u64); | |
94 | /* names (map<u64, string>) */ | |
95 | n = ceph_decode_32(p); | |
96 | while (n-- > 0) { | |
97 | u32 len; | |
98 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), | |
99 | bad); | |
100 | *p += sizeof(u64); | |
101 | len = ceph_decode_32(p); | |
102 | ceph_decode_need(p, end, len, bad); | |
103 | *p += len; | |
104 | } | |
105 | } | |
106 | return 0; | |
107 | bad: | |
108 | return -1; | |
109 | } | |
110 | ||
2f2dc053 SW |
111 | /* |
112 | * Decode an MDS map | |
113 | * | |
114 | * Ignore any fields we don't care about (there are quite a few of | |
115 | * them). | |
116 | */ | |
117 | struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |
118 | { | |
119 | struct ceph_mdsmap *m; | |
9ec7cab1 | 120 | const void *start = *p; |
2f2dc053 | 121 | int i, j, n; |
f3848af1 | 122 | int err; |
d463a43d | 123 | u8 mdsmap_v, mdsmap_cv; |
e9e427f0 | 124 | u16 mdsmap_ev; |
2f2dc053 SW |
125 | |
126 | m = kzalloc(sizeof(*m), GFP_NOFS); | |
d37b1d99 | 127 | if (!m) |
2f2dc053 SW |
128 | return ERR_PTR(-ENOMEM); |
129 | ||
d463a43d YZ |
130 | ceph_decode_need(p, end, 1 + 1, bad); |
131 | mdsmap_v = ceph_decode_8(p); | |
132 | mdsmap_cv = ceph_decode_8(p); | |
133 | if (mdsmap_v >= 4) { | |
134 | u32 mdsmap_len; | |
135 | ceph_decode_32_safe(p, end, mdsmap_len, bad); | |
136 | if (end < *p + mdsmap_len) | |
137 | goto bad; | |
138 | end = *p + mdsmap_len; | |
4f6a7e5e | 139 | } |
2f2dc053 SW |
140 | |
141 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); | |
c89136ea SW |
142 | m->m_epoch = ceph_decode_32(p); |
143 | m->m_client_epoch = ceph_decode_32(p); | |
144 | m->m_last_failure = ceph_decode_32(p); | |
145 | m->m_root = ceph_decode_32(p); | |
146 | m->m_session_timeout = ceph_decode_32(p); | |
147 | m->m_session_autoclose = ceph_decode_32(p); | |
148 | m->m_max_file_size = ceph_decode_64(p); | |
149 | m->m_max_mds = ceph_decode_32(p); | |
4d7ace02 XL |
150 | |
151 | /* | |
b38c9eb4 XL |
152 | * pick out the active nodes as the m_num_active_mds, the |
153 | * m_num_active_mds maybe larger than m_max_mds when decreasing | |
154 | * the max_mds in cluster side, in other case it should less | |
155 | * than or equal to m_max_mds. | |
4d7ace02 | 156 | */ |
b38c9eb4 | 157 | m->m_num_active_mds = n = ceph_decode_32(p); |
4d7ace02 XL |
158 | |
159 | /* | |
b38c9eb4 | 160 | * the possible max rank, it maybe larger than the m_num_active_mds, |
4d7ace02 XL |
161 | * for example if the mds_max == 2 in the cluster, when the MDS(0) |
162 | * was laggy and being replaced by a new MDS, we will temporarily | |
163 | * receive a new mds map with n_num_mds == 1 and the active MDS(1), | |
b38c9eb4 | 164 | * and the mds rank >= m_num_active_mds. |
4d7ace02 | 165 | */ |
b38c9eb4 | 166 | m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); |
2f2dc053 | 167 | |
b38c9eb4 | 168 | m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); |
d37b1d99 | 169 | if (!m->m_info) |
e9e427f0 | 170 | goto nomem; |
2f2dc053 SW |
171 | |
172 | /* pick out active nodes from mds_info (state > 0) */ | |
2f2dc053 | 173 | for (i = 0; i < n; i++) { |
94045e11 | 174 | u64 global_id; |
2f2dc053 SW |
175 | u32 namelen; |
176 | s32 mds, inc, state; | |
177 | u64 state_seq; | |
d463a43d YZ |
178 | u8 info_v; |
179 | void *info_end = NULL; | |
2f2dc053 SW |
180 | struct ceph_entity_addr addr; |
181 | u32 num_export_targets; | |
182 | void *pexport_targets = NULL; | |
0deb01c9 | 183 | struct ceph_timespec laggy_since; |
6af86528 | 184 | struct ceph_mds_info *info; |
da08e1e1 | 185 | bool laggy; |
2f2dc053 | 186 | |
d463a43d | 187 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
94045e11 | 188 | global_id = ceph_decode_64(p); |
d463a43d YZ |
189 | info_v= ceph_decode_8(p); |
190 | if (info_v >= 4) { | |
191 | u32 info_len; | |
192 | u8 info_cv; | |
193 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); | |
194 | info_cv = ceph_decode_8(p); | |
195 | info_len = ceph_decode_32(p); | |
196 | info_end = *p + info_len; | |
197 | if (info_end > end) | |
198 | goto bad; | |
199 | } | |
200 | ||
201 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | |
94045e11 | 202 | *p += sizeof(u64); |
c89136ea | 203 | namelen = ceph_decode_32(p); /* skip mds name */ |
2f2dc053 SW |
204 | *p += namelen; |
205 | ||
206 | ceph_decode_need(p, end, | |
e251e288 | 207 | 4*sizeof(u32) + sizeof(u64) + |
2f2dc053 SW |
208 | sizeof(addr) + sizeof(struct ceph_timespec), |
209 | bad); | |
c89136ea SW |
210 | mds = ceph_decode_32(p); |
211 | inc = ceph_decode_32(p); | |
212 | state = ceph_decode_32(p); | |
213 | state_seq = ceph_decode_64(p); | |
f3848af1 JL |
214 | err = ceph_decode_entity_addr(p, end, &addr); |
215 | if (err) | |
216 | goto corrupt; | |
0deb01c9 | 217 | ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); |
da08e1e1 | 218 | laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; |
2f2dc053 SW |
219 | *p += sizeof(u32); |
220 | ceph_decode_32_safe(p, end, namelen, bad); | |
e251e288 | 221 | *p += namelen; |
d463a43d | 222 | if (info_v >= 2) { |
2f2dc053 SW |
223 | ceph_decode_32_safe(p, end, num_export_targets, bad); |
224 | pexport_targets = *p; | |
e251e288 | 225 | *p += num_export_targets * sizeof(u32); |
2f2dc053 SW |
226 | } else { |
227 | num_export_targets = 0; | |
228 | } | |
229 | ||
d463a43d YZ |
230 | if (info_end && *p != info_end) { |
231 | if (*p > info_end) | |
232 | goto bad; | |
233 | *p = info_end; | |
234 | } | |
235 | ||
da08e1e1 | 236 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", |
3d14c5d2 | 237 | i+1, n, global_id, mds, inc, |
b726ec97 | 238 | ceph_pr_addr(&addr), |
da08e1e1 XL |
239 | ceph_mds_state_name(state), |
240 | laggy ? "(laggy)" : ""); | |
6af86528 | 241 | |
b38c9eb4 | 242 | if (mds < 0 || mds >= m->possible_max_rank) { |
4d7ace02 | 243 | pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); |
6af86528 | 244 | continue; |
4d7ace02 | 245 | } |
6af86528 | 246 | |
4d7ace02 XL |
247 | if (state <= 0) { |
248 | pr_warn("mdsmap_decode got incorrect state(%s)\n", | |
249 | ceph_mds_state_name(state)); | |
250 | continue; | |
76201b63 YZ |
251 | } |
252 | ||
6af86528 DC |
253 | info = &m->m_info[mds]; |
254 | info->global_id = global_id; | |
255 | info->state = state; | |
256 | info->addr = addr; | |
da08e1e1 | 257 | info->laggy = laggy; |
6af86528 DC |
258 | info->num_export_targets = num_export_targets; |
259 | if (num_export_targets) { | |
260 | info->export_targets = kcalloc(num_export_targets, | |
261 | sizeof(u32), GFP_NOFS); | |
d37b1d99 | 262 | if (!info->export_targets) |
e9e427f0 | 263 | goto nomem; |
6af86528 DC |
264 | for (j = 0; j < num_export_targets; j++) |
265 | info->export_targets[j] = | |
266 | ceph_decode_32(&pexport_targets); | |
267 | } else { | |
268 | info->export_targets = NULL; | |
2f2dc053 SW |
269 | } |
270 | } | |
271 | ||
272 | /* pg_pools */ | |
273 | ceph_decode_32_safe(p, end, n, bad); | |
274 | m->m_num_data_pg_pools = n; | |
4f6a7e5e | 275 | m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); |
2f2dc053 | 276 | if (!m->m_data_pg_pools) |
e9e427f0 | 277 | goto nomem; |
4f6a7e5e | 278 | ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); |
2f2dc053 | 279 | for (i = 0; i < n; i++) |
4f6a7e5e SW |
280 | m->m_data_pg_pools[i] = ceph_decode_64(p); |
281 | m->m_cas_pg_pool = ceph_decode_64(p); | |
e9e427f0 YZ |
282 | m->m_enabled = m->m_epoch > 1; |
283 | ||
284 | mdsmap_ev = 1; | |
285 | if (mdsmap_v >= 2) { | |
286 | ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); | |
287 | } | |
288 | if (mdsmap_ev >= 3) { | |
289 | if (__decode_and_drop_compat_set(p, end) < 0) | |
290 | goto bad_ext; | |
291 | } | |
292 | /* metadata_pool */ | |
293 | if (mdsmap_ev < 5) { | |
294 | __decode_and_drop_type(p, end, u32, bad_ext); | |
295 | } else { | |
296 | __decode_and_drop_type(p, end, u64, bad_ext); | |
297 | } | |
2f2dc053 | 298 | |
e9e427f0 YZ |
299 | /* created + modified + tableserver */ |
300 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); | |
301 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); | |
302 | __decode_and_drop_type(p, end, u32, bad_ext); | |
303 | ||
304 | /* in */ | |
305 | { | |
306 | int num_laggy = 0; | |
307 | ceph_decode_32_safe(p, end, n, bad_ext); | |
308 | ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); | |
309 | ||
310 | for (i = 0; i < n; i++) { | |
311 | s32 mds = ceph_decode_32(p); | |
b38c9eb4 | 312 | if (mds >= 0 && mds < m->possible_max_rank) { |
e9e427f0 YZ |
313 | if (m->m_info[mds].laggy) |
314 | num_laggy++; | |
315 | } | |
316 | } | |
317 | m->m_num_laggy = num_laggy; | |
76201b63 | 318 | |
b38c9eb4 | 319 | if (n > m->possible_max_rank) { |
76201b63 YZ |
320 | void *new_m_info = krealloc(m->m_info, |
321 | n * sizeof(*m->m_info), | |
322 | GFP_NOFS | __GFP_ZERO); | |
323 | if (!new_m_info) | |
324 | goto nomem; | |
325 | m->m_info = new_m_info; | |
326 | } | |
b38c9eb4 | 327 | m->possible_max_rank = n; |
e9e427f0 YZ |
328 | } |
329 | ||
330 | /* inc */ | |
331 | __decode_and_drop_map(p, end, u32, u32, bad_ext); | |
332 | /* up */ | |
333 | __decode_and_drop_map(p, end, u32, u64, bad_ext); | |
334 | /* failed */ | |
335 | __decode_and_drop_set(p, end, u32, bad_ext); | |
336 | /* stopped */ | |
337 | __decode_and_drop_set(p, end, u32, bad_ext); | |
338 | ||
339 | if (mdsmap_ev >= 4) { | |
340 | /* last_failure_osd_epoch */ | |
341 | __decode_and_drop_type(p, end, u32, bad_ext); | |
342 | } | |
343 | if (mdsmap_ev >= 6) { | |
344 | /* ever_allowed_snaps */ | |
345 | __decode_and_drop_type(p, end, u8, bad_ext); | |
346 | /* explicitly_allowed_snaps */ | |
347 | __decode_and_drop_type(p, end, u8, bad_ext); | |
348 | } | |
349 | if (mdsmap_ev >= 7) { | |
350 | /* inline_data_enabled */ | |
351 | __decode_and_drop_type(p, end, u8, bad_ext); | |
352 | } | |
353 | if (mdsmap_ev >= 8) { | |
354 | u32 name_len; | |
355 | /* enabled */ | |
356 | ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); | |
357 | ceph_decode_32_safe(p, end, name_len, bad_ext); | |
358 | ceph_decode_need(p, end, name_len, bad_ext); | |
359 | *p += name_len; | |
360 | } | |
361 | /* damaged */ | |
362 | if (mdsmap_ev >= 9) { | |
363 | size_t need; | |
364 | ceph_decode_32_safe(p, end, n, bad_ext); | |
365 | need = sizeof(u32) * n; | |
366 | ceph_decode_need(p, end, need, bad_ext); | |
367 | *p += need; | |
368 | m->m_damaged = n > 0; | |
369 | } else { | |
370 | m->m_damaged = false; | |
371 | } | |
372 | bad_ext: | |
da08e1e1 XL |
373 | dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", |
374 | !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); | |
d463a43d | 375 | *p = end; |
2f2dc053 SW |
376 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); |
377 | return m; | |
e9e427f0 | 378 | nomem: |
2f2dc053 | 379 | err = -ENOMEM; |
e9e427f0 | 380 | goto out_err; |
f3848af1 | 381 | corrupt: |
2f2dc053 | 382 | pr_err("corrupt mdsmap\n"); |
9ec7cab1 SW |
383 | print_hex_dump(KERN_DEBUG, "mdsmap: ", |
384 | DUMP_PREFIX_OFFSET, 16, 1, | |
385 | start, end - start, true); | |
e9e427f0 | 386 | out_err: |
2f2dc053 | 387 | ceph_mdsmap_destroy(m); |
c213b50b | 388 | return ERR_PTR(err); |
f3848af1 JL |
389 | bad: |
390 | err = -EINVAL; | |
391 | goto corrupt; | |
2f2dc053 SW |
392 | } |
393 | ||
394 | void ceph_mdsmap_destroy(struct ceph_mdsmap *m) | |
395 | { | |
396 | int i; | |
397 | ||
b38c9eb4 | 398 | for (i = 0; i < m->possible_max_rank; i++) |
2f2dc053 SW |
399 | kfree(m->m_info[i].export_targets); |
400 | kfree(m->m_info); | |
401 | kfree(m->m_data_pg_pools); | |
402 | kfree(m); | |
403 | } | |
e9e427f0 YZ |
404 | |
405 | bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) | |
406 | { | |
407 | int i, nr_active = 0; | |
408 | if (!m->m_enabled) | |
409 | return false; | |
410 | if (m->m_damaged) | |
411 | return false; | |
4d7ace02 | 412 | if (m->m_num_laggy == m->m_num_active_mds) |
e9e427f0 | 413 | return false; |
b38c9eb4 | 414 | for (i = 0; i < m->possible_max_rank; i++) { |
e9e427f0 YZ |
415 | if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) |
416 | nr_active++; | |
417 | } | |
418 | return nr_active > 0; | |
419 | } |