Merge tag 'landlock-6.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mic...
[linux-block.git] / net / ceph / messenger_v2.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Ceph msgr2 protocol implementation
4  *
5  * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
6  */
7
8 #include <linux/ceph/ceph_debug.h>
9
10 #include <crypto/aead.h>
11 #include <crypto/algapi.h>  /* for crypto_memneq() */
12 #include <crypto/hash.h>
13 #include <crypto/sha2.h>
14 #include <linux/bvec.h>
15 #include <linux/crc32c.h>
16 #include <linux/net.h>
17 #include <linux/scatterlist.h>
18 #include <linux/socket.h>
19 #include <linux/sched/mm.h>
20 #include <net/sock.h>
21 #include <net/tcp.h>
22
23 #include <linux/ceph/ceph_features.h>
24 #include <linux/ceph/decode.h>
25 #include <linux/ceph/libceph.h>
26 #include <linux/ceph/messenger.h>
27
28 #include "crypto.h"  /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
29
/*
 * Frame tag values.  A tag is what encode_preamble() writes as the
 * first byte of a preamble (ceph_frame_desc::fd_tag) and what
 * decode_preamble() reads back from it.
 */
30 #define FRAME_TAG_HELLO                 1
31 #define FRAME_TAG_AUTH_REQUEST          2
32 #define FRAME_TAG_AUTH_BAD_METHOD       3
33 #define FRAME_TAG_AUTH_REPLY_MORE       4
34 #define FRAME_TAG_AUTH_REQUEST_MORE     5
35 #define FRAME_TAG_AUTH_DONE             6
36 #define FRAME_TAG_AUTH_SIGNATURE        7
37 #define FRAME_TAG_CLIENT_IDENT          8
38 #define FRAME_TAG_SERVER_IDENT          9
39 #define FRAME_TAG_IDENT_MISSING_FEATURES 10
40 #define FRAME_TAG_SESSION_RECONNECT     11
41 #define FRAME_TAG_SESSION_RESET         12
42 #define FRAME_TAG_SESSION_RETRY         13
43 #define FRAME_TAG_SESSION_RETRY_GLOBAL  14
44 #define FRAME_TAG_SESSION_RECONNECT_OK  15
45 #define FRAME_TAG_WAIT                  16
46 #define FRAME_TAG_MESSAGE               17
47 #define FRAME_TAG_KEEPALIVE2            18
48 #define FRAME_TAG_KEEPALIVE2_ACK        19
49 #define FRAME_TAG_ACK                   20
50
/*
 * Values of the epilogue's late_status byte.  The encode_epilogue_*()
 * helpers store ABORTED or COMPLETE; decode_epilogue() masks the byte
 * with ABORTED_MASK and accepts only COMPLETE.
 */
51 #define FRAME_LATE_STATUS_ABORTED       0x1
52 #define FRAME_LATE_STATUS_COMPLETE      0xe
53 #define FRAME_LATE_STATUS_ABORTED_MASK  0xf
54
/*
 * NOTE(review): presumably the states of the receive-side state
 * machine -- their users are outside this part of the file, confirm
 * there.
 */
55 #define IN_S_HANDLE_PREAMBLE            1
56 #define IN_S_HANDLE_CONTROL             2
57 #define IN_S_HANDLE_CONTROL_REMAINDER   3
58 #define IN_S_PREPARE_READ_DATA          4
59 #define IN_S_PREPARE_READ_DATA_CONT     5
60 #define IN_S_PREPARE_READ_ENC_PAGE      6
61 #define IN_S_HANDLE_EPILOGUE            7
62 #define IN_S_FINISH_SKIP                8
63
/*
 * NOTE(review): presumably the states of the send-side state machine --
 * their users are outside this part of the file, confirm there.
 */
64 #define OUT_S_QUEUE_DATA                1
65 #define OUT_S_QUEUE_DATA_CONT           2
66 #define OUT_S_QUEUE_ENC_PAGE            3
67 #define OUT_S_QUEUE_ZEROS               4
68 #define OUT_S_FINISH_MESSAGE            5
69 #define OUT_S_GET_NEXT                  6
70
/*
 * Pointer-offset helpers, all yielding void *:
 *   CTRL_BODY(p)  - p advanced past a preamble (CEPH_PREAMBLE_LEN bytes)
 *   FRONT_PAD(p)  - p advanced past a secure epilogue
 *                   (CEPH_EPILOGUE_SECURE_LEN bytes)
 *   MIDDLE_PAD(p) - one GCM block (CEPH_GCM_BLOCK_LEN) after FRONT_PAD(p)
 *   DATA_PAD(p)   - one GCM block after MIDDLE_PAD(p)
 */
71 #define CTRL_BODY(p)    ((void *)(p) + CEPH_PREAMBLE_LEN)
72 #define FRONT_PAD(p)    ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
73 #define MIDDLE_PAD(p)   (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
74 #define DATA_PAD(p)     (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
75
/*
 * Flags used for every socket send/receive in this file: used as
 * msghdr::msg_flags by the do_*() helpers and passed to ->sendpage().
 */
76 #define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
77
78 static int do_recvmsg(struct socket *sock, struct iov_iter *it)
79 {
80         struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
81         int ret;
82
83         msg.msg_iter = *it;
84         while (iov_iter_count(it)) {
85                 ret = sock_recvmsg(sock, &msg, msg.msg_flags);
86                 if (ret <= 0) {
87                         if (ret == -EAGAIN)
88                                 ret = 0;
89                         return ret;
90                 }
91
92                 iov_iter_advance(it, ret);
93         }
94
95         WARN_ON(msg_data_left(&msg));
96         return 1;
97 }
98
99 /*
100  * Read as much as possible.
101  *
102  * Return:
103  *   1 - done, nothing (else) to read
104  *   0 - socket is empty, need to wait
105  *  <0 - error
106  */
107 static int ceph_tcp_recv(struct ceph_connection *con)
108 {
109         int ret;
110
111         dout("%s con %p %s %zu\n", __func__, con,
112              iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
113              iov_iter_count(&con->v2.in_iter));
114         ret = do_recvmsg(con->sock, &con->v2.in_iter);
115         dout("%s con %p ret %d left %zu\n", __func__, con, ret,
116              iov_iter_count(&con->v2.in_iter));
117         return ret;
118 }
119
120 static int do_sendmsg(struct socket *sock, struct iov_iter *it)
121 {
122         struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
123         int ret;
124
125         msg.msg_iter = *it;
126         while (iov_iter_count(it)) {
127                 ret = sock_sendmsg(sock, &msg);
128                 if (ret <= 0) {
129                         if (ret == -EAGAIN)
130                                 ret = 0;
131                         return ret;
132                 }
133
134                 iov_iter_advance(it, ret);
135         }
136
137         WARN_ON(msg_data_left(&msg));
138         return 1;
139 }
140
141 static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
142 {
143         struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
144         struct bio_vec bv;
145         int ret;
146
147         if (WARN_ON(!iov_iter_is_bvec(it)))
148                 return -EINVAL;
149
150         while (iov_iter_count(it)) {
151                 /* iov_iter_iovec() for ITER_BVEC */
152                 bvec_set_page(&bv, it->bvec->bv_page,
153                               min(iov_iter_count(it),
154                                   it->bvec->bv_len - it->iov_offset),
155                               it->bvec->bv_offset + it->iov_offset);
156
157                 /*
158                  * sendpage cannot properly handle pages with
159                  * page_count == 0, we need to fall back to sendmsg if
160                  * that's the case.
161                  *
162                  * Same goes for slab pages: skb_can_coalesce() allows
163                  * coalescing neighboring slab objects into a single frag
164                  * which triggers one of hardened usercopy checks.
165                  */
166                 if (sendpage_ok(bv.bv_page)) {
167                         ret = sock->ops->sendpage(sock, bv.bv_page,
168                                                   bv.bv_offset, bv.bv_len,
169                                                   CEPH_MSG_FLAGS);
170                 } else {
171                         iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
172                         ret = sock_sendmsg(sock, &msg);
173                 }
174                 if (ret <= 0) {
175                         if (ret == -EAGAIN)
176                                 ret = 0;
177                         return ret;
178                 }
179
180                 iov_iter_advance(it, ret);
181         }
182
183         return 1;
184 }
185
186 /*
187  * Write as much as possible.  The socket is expected to be corked,
188  * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
189  *
190  * Return:
191  *   1 - done, nothing (else) to write
192  *   0 - socket is full, need to wait
193  *  <0 - error
194  */
195 static int ceph_tcp_send(struct ceph_connection *con)
196 {
197         int ret;
198
199         dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
200              iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
201         if (con->v2.out_iter_sendpage)
202                 ret = do_try_sendpage(con->sock, &con->v2.out_iter);
203         else
204                 ret = do_sendmsg(con->sock, &con->v2.out_iter);
205         dout("%s con %p ret %d left %zu\n", __func__, con, ret,
206              iov_iter_count(&con->v2.out_iter));
207         return ret;
208 }
209
210 static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
211 {
212         BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
213         WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
214
215         con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
216         con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
217         con->v2.in_kvec_cnt++;
218
219         con->v2.in_iter.nr_segs++;
220         con->v2.in_iter.count += len;
221 }
222
223 static void reset_in_kvecs(struct ceph_connection *con)
224 {
225         WARN_ON(iov_iter_count(&con->v2.in_iter));
226
227         con->v2.in_kvec_cnt = 0;
228         iov_iter_kvec(&con->v2.in_iter, ITER_DEST, con->v2.in_kvecs, 0, 0);
229 }
230
231 static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
232 {
233         WARN_ON(iov_iter_count(&con->v2.in_iter));
234
235         con->v2.in_bvec = *bv;
236         iov_iter_bvec(&con->v2.in_iter, ITER_DEST, &con->v2.in_bvec, 1, bv->bv_len);
237 }
238
239 static void set_in_skip(struct ceph_connection *con, int len)
240 {
241         WARN_ON(iov_iter_count(&con->v2.in_iter));
242
243         dout("%s con %p len %d\n", __func__, con, len);
244         iov_iter_discard(&con->v2.in_iter, ITER_DEST, len);
245 }
246
247 static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
248 {
249         BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
250         WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
251         WARN_ON(con->v2.out_zero);
252
253         con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
254         con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
255         con->v2.out_kvec_cnt++;
256
257         con->v2.out_iter.nr_segs++;
258         con->v2.out_iter.count += len;
259 }
260
261 static void reset_out_kvecs(struct ceph_connection *con)
262 {
263         WARN_ON(iov_iter_count(&con->v2.out_iter));
264         WARN_ON(con->v2.out_zero);
265
266         con->v2.out_kvec_cnt = 0;
267
268         iov_iter_kvec(&con->v2.out_iter, ITER_SOURCE, con->v2.out_kvecs, 0, 0);
269         con->v2.out_iter_sendpage = false;
270 }
271
272 static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
273                          bool zerocopy)
274 {
275         WARN_ON(iov_iter_count(&con->v2.out_iter));
276         WARN_ON(con->v2.out_zero);
277
278         con->v2.out_bvec = *bv;
279         con->v2.out_iter_sendpage = zerocopy;
280         iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1,
281                       con->v2.out_bvec.bv_len);
282 }
283
284 static void set_out_bvec_zero(struct ceph_connection *con)
285 {
286         WARN_ON(iov_iter_count(&con->v2.out_iter));
287         WARN_ON(!con->v2.out_zero);
288
289         bvec_set_page(&con->v2.out_bvec, ceph_zero_page,
290                       min(con->v2.out_zero, (int)PAGE_SIZE), 0);
291         con->v2.out_iter_sendpage = true;
292         iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1,
293                       con->v2.out_bvec.bv_len);
294 }
295
296 static void out_zero_add(struct ceph_connection *con, int len)
297 {
298         dout("%s con %p len %d\n", __func__, con, len);
299         con->v2.out_zero += len;
300 }
301
302 static void *alloc_conn_buf(struct ceph_connection *con, int len)
303 {
304         void *buf;
305
306         dout("%s con %p len %d\n", __func__, con, len);
307
308         if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
309                 return NULL;
310
311         buf = kvmalloc(len, GFP_NOIO);
312         if (!buf)
313                 return NULL;
314
315         con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
316         return buf;
317 }
318
319 static void free_conn_bufs(struct ceph_connection *con)
320 {
321         while (con->v2.conn_buf_cnt)
322                 kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
323 }
324
325 static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
326 {
327         BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
328
329         con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
330         con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
331         con->v2.in_sign_kvec_cnt++;
332 }
333
334 static void clear_in_sign_kvecs(struct ceph_connection *con)
335 {
336         con->v2.in_sign_kvec_cnt = 0;
337 }
338
339 static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
340 {
341         BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
342
343         con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
344         con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
345         con->v2.out_sign_kvec_cnt++;
346 }
347
348 static void clear_out_sign_kvecs(struct ceph_connection *con)
349 {
350         con->v2.out_sign_kvec_cnt = 0;
351 }
352
353 static bool con_secure(struct ceph_connection *con)
354 {
355         return con->v2.con_mode == CEPH_CON_MODE_SECURE;
356 }
357
358 static int front_len(const struct ceph_msg *msg)
359 {
360         return le32_to_cpu(msg->hdr.front_len);
361 }
362
363 static int middle_len(const struct ceph_msg *msg)
364 {
365         return le32_to_cpu(msg->hdr.middle_len);
366 }
367
368 static int data_len(const struct ceph_msg *msg)
369 {
370         return le32_to_cpu(msg->hdr.data_len);
371 }
372
373 static bool need_padding(int len)
374 {
375         return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
376 }
377
378 static int padded_len(int len)
379 {
380         return ALIGN(len, CEPH_GCM_BLOCK_LEN);
381 }
382
/*
 * Number of padding bytes needed to bring @len up to padded_len(@len).
 */
static int padding_len(int len)
{
        int padded = padded_len(len);

        return padded - len;
}
387
388 /* preamble + control segment */
389 static int head_onwire_len(int ctrl_len, bool secure)
390 {
391         int head_len;
392         int rem_len;
393
394         if (secure) {
395                 head_len = CEPH_PREAMBLE_SECURE_LEN;
396                 if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
397                         rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
398                         head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
399                 }
400         } else {
401                 head_len = CEPH_PREAMBLE_PLAIN_LEN;
402                 if (ctrl_len)
403                         head_len += ctrl_len + CEPH_CRC_LEN;
404         }
405         return head_len;
406 }
407
408 /* front, middle and data segments + epilogue */
409 static int __tail_onwire_len(int front_len, int middle_len, int data_len,
410                              bool secure)
411 {
412         if (!front_len && !middle_len && !data_len)
413                 return 0;
414
415         if (!secure)
416                 return front_len + middle_len + data_len +
417                        CEPH_EPILOGUE_PLAIN_LEN;
418
419         return padded_len(front_len) + padded_len(middle_len) +
420                padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
421 }
422
/*
 * __tail_onwire_len() for the segment lengths recorded in @msg's
 * header.
 */
static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
{
        int front = front_len(msg);
        int middle = middle_len(msg);
        int data = data_len(msg);

        return __tail_onwire_len(front, middle, data, secure);
}
428
/*
 * Constant form of the plain-mode branch of head_onwire_len() for a
 * control segment of sizeof(struct ceph_msg_header2) bytes: plain
 * preamble + control segment + control crc.
 */
429 /* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
430 #define MESSAGE_HEAD_PLAIN_LEN  (CEPH_PREAMBLE_PLAIN_LEN +              \
431                                  sizeof(struct ceph_msg_header2) +      \
432                                  CEPH_CRC_LEN)
433
/*
 * Per-segment alignment values, indexed by segment number;
 * init_frame_desc() copies them into ceph_frame_desc::fd_aligns.
 * Going by the messages in decode_preamble(), segments 0-3 are control,
 * front, middle and data, so only the data segment is page-aligned.
 */
434 static const int frame_aligns[] = {
435         sizeof(void *),
436         sizeof(void *),
437         sizeof(void *),
438         PAGE_SIZE
439 };
440
/*
 * Number of segments to put on the wire for the @len_cnt lengths in
 * @lens: trailing empty segments are dropped, but at least one
 * (possibly empty) segment always remains.
 */
static int calc_segment_count(const int *lens, int len_cnt)
{
        int cnt = len_cnt;

        /* strip empty segments off the end */
        while (cnt > 0 && !lens[cnt - 1])
                cnt--;

        return cnt > 0 ? cnt : 1;
}
456
457 static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
458                             const int *lens, int len_cnt)
459 {
460         int i;
461
462         memset(desc, 0, sizeof(*desc));
463
464         desc->fd_tag = tag;
465         desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
466         BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
467         for (i = 0; i < desc->fd_seg_cnt; i++) {
468                 desc->fd_lens[i] = lens[i];
469                 desc->fd_aligns[i] = frame_aligns[i];
470         }
471 }
472
473 /*
474  * Preamble crc covers everything up to itself (28 bytes) and
475  * is calculated and verified irrespective of the connection mode
476  * (i.e. even if the frame is encrypted).
477  */
478 static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
479 {
480         void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
481         void *start = p;
482         int i;
483
484         memset(p, 0, CEPH_PREAMBLE_LEN);
485
486         ceph_encode_8(&p, desc->fd_tag);
487         ceph_encode_8(&p, desc->fd_seg_cnt);
488         for (i = 0; i < desc->fd_seg_cnt; i++) {
489                 ceph_encode_32(&p, desc->fd_lens[i]);
490                 ceph_encode_16(&p, desc->fd_aligns[i]);
491         }
492
493         put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
494 }
495
496 static int decode_preamble(void *p, struct ceph_frame_desc *desc)
497 {
498         void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
499         u32 crc, expected_crc;
500         int i;
501
502         crc = crc32c(0, p, crcp - p);
503         expected_crc = get_unaligned_le32(crcp);
504         if (crc != expected_crc) {
505                 pr_err("bad preamble crc, calculated %u, expected %u\n",
506                        crc, expected_crc);
507                 return -EBADMSG;
508         }
509
510         memset(desc, 0, sizeof(*desc));
511
512         desc->fd_tag = ceph_decode_8(&p);
513         desc->fd_seg_cnt = ceph_decode_8(&p);
514         if (desc->fd_seg_cnt < 1 ||
515             desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
516                 pr_err("bad segment count %d\n", desc->fd_seg_cnt);
517                 return -EINVAL;
518         }
519         for (i = 0; i < desc->fd_seg_cnt; i++) {
520                 desc->fd_lens[i] = ceph_decode_32(&p);
521                 desc->fd_aligns[i] = ceph_decode_16(&p);
522         }
523
524         /*
525          * This would fire for FRAME_TAG_WAIT (it has one empty
526          * segment), but we should never get it as client.
527          */
528         if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
529                 pr_err("last segment empty\n");
530                 return -EINVAL;
531         }
532
533         if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
534                 pr_err("control segment too big %d\n", desc->fd_lens[0]);
535                 return -EINVAL;
536         }
537         if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
538                 pr_err("front segment too big %d\n", desc->fd_lens[1]);
539                 return -EINVAL;
540         }
541         if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
542                 pr_err("middle segment too big %d\n", desc->fd_lens[2]);
543                 return -EINVAL;
544         }
545         if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
546                 pr_err("data segment too big %d\n", desc->fd_lens[3]);
547                 return -EINVAL;
548         }
549
550         return 0;
551 }
552
553 static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
554 {
555         con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
556                                                  FRAME_LATE_STATUS_COMPLETE;
557         cpu_to_le32s(&con->v2.out_epil.front_crc);
558         cpu_to_le32s(&con->v2.out_epil.middle_crc);
559         cpu_to_le32s(&con->v2.out_epil.data_crc);
560 }
561
562 static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
563 {
564         memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
565         con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
566                                                  FRAME_LATE_STATUS_COMPLETE;
567 }
568
569 static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
570                            u32 *data_crc)
571 {
572         u8 late_status;
573
574         late_status = ceph_decode_8(&p);
575         if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
576                         FRAME_LATE_STATUS_COMPLETE) {
577                 /* we should never get an aborted message as client */
578                 pr_err("bad late_status 0x%x\n", late_status);
579                 return -EINVAL;
580         }
581
582         if (front_crc && middle_crc && data_crc) {
583                 *front_crc = ceph_decode_32(&p);
584                 *middle_crc = ceph_decode_32(&p);
585                 *data_crc = ceph_decode_32(&p);
586         }
587
588         return 0;
589 }
590
591 static void fill_header(struct ceph_msg_header *hdr,
592                         const struct ceph_msg_header2 *hdr2,
593                         int front_len, int middle_len, int data_len,
594                         const struct ceph_entity_name *peer_name)
595 {
596         hdr->seq = hdr2->seq;
597         hdr->tid = hdr2->tid;
598         hdr->type = hdr2->type;
599         hdr->priority = hdr2->priority;
600         hdr->version = hdr2->version;
601         hdr->front_len = cpu_to_le32(front_len);
602         hdr->middle_len = cpu_to_le32(middle_len);
603         hdr->data_len = cpu_to_le32(data_len);
604         hdr->data_off = hdr2->data_off;
605         hdr->src = *peer_name;
606         hdr->compat_version = hdr2->compat_version;
607         hdr->reserved = 0;
608         hdr->crc = 0;
609 }
610
611 static void fill_header2(struct ceph_msg_header2 *hdr2,
612                          const struct ceph_msg_header *hdr, u64 ack_seq)
613 {
614         hdr2->seq = hdr->seq;
615         hdr2->tid = hdr->tid;
616         hdr2->type = hdr->type;
617         hdr2->priority = hdr->priority;
618         hdr2->version = hdr->version;
619         hdr2->data_pre_padding_len = 0;
620         hdr2->data_off = hdr->data_off;
621         hdr2->ack_seq = cpu_to_le64(ack_seq);
622         hdr2->flags = 0;
623         hdr2->compat_version = hdr->compat_version;
624         hdr2->reserved = 0;
625 }
626
627 static int verify_control_crc(struct ceph_connection *con)
628 {
629         int ctrl_len = con->v2.in_desc.fd_lens[0];
630         u32 crc, expected_crc;
631
632         WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
633         WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
634
635         crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
636         expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
637         if (crc != expected_crc) {
638                 pr_err("bad control crc, calculated %u, expected %u\n",
639                        crc, expected_crc);
640                 return -EBADMSG;
641         }
642
643         return 0;
644 }
645
646 static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
647                                 u32 middle_crc, u32 data_crc)
648 {
649         if (front_len(con->in_msg)) {
650                 con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
651                                            front_len(con->in_msg));
652         } else {
653                 WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
654                 con->in_front_crc = -1;
655         }
656
657         if (middle_len(con->in_msg))
658                 con->in_middle_crc = crc32c(-1,
659                                             con->in_msg->middle->vec.iov_base,
660                                             middle_len(con->in_msg));
661         else if (data_len(con->in_msg))
662                 con->in_middle_crc = -1;
663         else
664                 con->in_middle_crc = 0;
665
666         if (!data_len(con->in_msg))
667                 con->in_data_crc = 0;
668
669         dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
670              con->in_front_crc, con->in_middle_crc, con->in_data_crc);
671
672         if (con->in_front_crc != front_crc) {
673                 pr_err("bad front crc, calculated %u, expected %u\n",
674                        con->in_front_crc, front_crc);
675                 return -EBADMSG;
676         }
677         if (con->in_middle_crc != middle_crc) {
678                 pr_err("bad middle crc, calculated %u, expected %u\n",
679                        con->in_middle_crc, middle_crc);
680                 return -EBADMSG;
681         }
682         if (con->in_data_crc != data_crc) {
683                 pr_err("bad data crc, calculated %u, expected %u\n",
684                        con->in_data_crc, data_crc);
685                 return -EBADMSG;
686         }
687
688         return 0;
689 }
690
/*
 * Set up the crypto state of @con from the session key and connection
 * secret.
 *
 * Depending on con->v2.con_mode and the key lengths, success means one
 * of three things (see the return statements below):
 *   - auth_none: no session key, nothing is allocated
 *   - auth_x, plain mode: only the HMAC-SHA256 tfm is set up
 *   - auth_x, secure mode: HMAC tfm plus the AES-GCM tfm, its request
 *     and the in/out nonces
 *
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): error returns that follow a successful allocation
 * leave hmac_tfm/gcm_tfm set in con->v2; presumably a caller-side
 * reset path frees them -- that path is not visible here, confirm.
 */
691 static int setup_crypto(struct ceph_connection *con,
692                         const u8 *session_key, int session_key_len,
693                         const u8 *con_secret, int con_secret_len)
694 {
695         unsigned int noio_flag;
696         int ret;
697
698         dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
699              __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
        /* crypto state is expected to be unset on entry */
700         WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req);
701
702         if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
703             con->v2.con_mode != CEPH_CON_MODE_SECURE) {
704                 pr_err("bad con_mode %d\n", con->v2.con_mode);
705                 return -EINVAL;
706         }
707
708         if (!session_key_len) {
709                 WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
710                 WARN_ON(con_secret_len);
711                 return 0;  /* auth_none */
712         }
713
        /* the tfm allocation runs inside a memalloc_noio scope */
714         noio_flag = memalloc_noio_save();
715         con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
716         memalloc_noio_restore(noio_flag);
717         if (IS_ERR(con->v2.hmac_tfm)) {
718                 ret = PTR_ERR(con->v2.hmac_tfm);
                /* don't leave an ERR_PTR behind in con->v2 */
719                 con->v2.hmac_tfm = NULL;
720                 pr_err("failed to allocate hmac tfm context: %d\n", ret);
721                 return ret;
722         }
723
724         WARN_ON((unsigned long)session_key &
725                 crypto_shash_alignmask(con->v2.hmac_tfm));
726         ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key,
727                                   session_key_len);
728         if (ret) {
729                 pr_err("failed to set hmac key: %d\n", ret);
730                 return ret;
731         }
732
733         if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
734                 WARN_ON(con_secret_len);
735                 return 0;  /* auth_x, plain mode */
736         }
737
        /* con_secret must hold the GCM key followed by two nonces */
738         if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
739                 pr_err("con_secret too small %d\n", con_secret_len);
740                 return -EINVAL;
741         }
742
743         noio_flag = memalloc_noio_save();
744         con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
745         memalloc_noio_restore(noio_flag);
746         if (IS_ERR(con->v2.gcm_tfm)) {
747                 ret = PTR_ERR(con->v2.gcm_tfm);
                /* don't leave an ERR_PTR behind in con->v2 */
748                 con->v2.gcm_tfm = NULL;
749                 pr_err("failed to allocate gcm tfm context: %d\n", ret);
750                 return ret;
751         }
752
753         WARN_ON((unsigned long)con_secret &
754                 crypto_aead_alignmask(con->v2.gcm_tfm));
        /* only the first CEPH_GCM_KEY_LEN bytes of con_secret are the key */
755         ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN);
756         if (ret) {
757                 pr_err("failed to set gcm key: %d\n", ret);
758                 return ret;
759         }
760
761         WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
762         ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
763         if (ret) {
764                 pr_err("failed to set gcm tag size: %d\n", ret);
765                 return ret;
766         }
767
768         con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
769         if (!con->v2.gcm_req) {
770                 pr_err("failed to allocate gcm request\n");
771                 return -ENOMEM;
772         }
773
        /* completion of the request is reported through con->v2.gcm_wait */
774         crypto_init_wait(&con->v2.gcm_wait);
775         aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
776                                   crypto_req_done, &con->v2.gcm_wait);
777
        /* after the key: first the in (rx) nonce, then the out (tx) nonce */
778         memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN,
779                CEPH_GCM_IV_LEN);
780         memcpy(&con->v2.out_gcm_nonce,
781                con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN,
782                CEPH_GCM_IV_LEN);
783         return 0;  /* auth_x, secure mode */
784 }
785
786 static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs,
787                        int kvec_cnt, u8 *hmac)
788 {
789         SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm);  /* tfm arg is ignored */
790         int ret;
791         int i;
792
793         dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con,
794              con->v2.hmac_tfm, kvec_cnt);
795
796         if (!con->v2.hmac_tfm) {
797                 memset(hmac, 0, SHA256_DIGEST_SIZE);
798                 return 0;  /* auth_none */
799         }
800
801         desc->tfm = con->v2.hmac_tfm;
802         ret = crypto_shash_init(desc);
803         if (ret)
804                 goto out;
805
806         for (i = 0; i < kvec_cnt; i++) {
807                 WARN_ON((unsigned long)kvecs[i].iov_base &
808                         crypto_shash_alignmask(con->v2.hmac_tfm));
809                 ret = crypto_shash_update(desc, kvecs[i].iov_base,
810                                           kvecs[i].iov_len);
811                 if (ret)
812                         goto out;
813         }
814
815         ret = crypto_shash_final(desc, hmac);
816
817 out:
818         shash_desc_zero(desc);
819         return ret;  /* auth_x, both plain and secure modes */
820 }
821
822 static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
823 {
824         u64 counter;
825
826         counter = le64_to_cpu(nonce->counter);
827         nonce->counter = cpu_to_le64(counter + 1);
828 }
829
830 static int gcm_crypt(struct ceph_connection *con, bool encrypt,
831                      struct scatterlist *src, struct scatterlist *dst,
832                      int src_len)
833 {
834         struct ceph_gcm_nonce *nonce;
835         int ret;
836
837         nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
838
839         aead_request_set_ad(con->v2.gcm_req, 0);  /* no AAD */
840         aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
841         ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
842                                         crypto_aead_decrypt(con->v2.gcm_req),
843                               &con->v2.gcm_wait);
844         if (ret)
845                 return ret;
846
847         gcm_inc_nonce(nonce);
848         return 0;
849 }
850
851 static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
852                         struct bio_vec *bv)
853 {
854         struct page *page;
855         size_t off, len;
856
857         WARN_ON(!cursor->total_resid);
858
859         /* skip zero-length data items */
860         while (!cursor->resid)
861                 ceph_msg_data_advance(cursor, 0);
862
863         /* get a piece of data, cursor isn't advanced */
864         page = ceph_msg_data_next(cursor, &off, &len);
865         bvec_set_page(bv, page, len, off);
866 }
867
868 static int calc_sg_cnt(void *buf, int buf_len)
869 {
870         int sg_cnt;
871
872         if (!buf_len)
873                 return 0;
874
875         sg_cnt = need_padding(buf_len) ? 1 : 0;
876         if (is_vmalloc_addr(buf)) {
877                 WARN_ON(offset_in_page(buf));
878                 sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
879         } else {
880                 sg_cnt++;
881         }
882
883         return sg_cnt;
884 }
885
886 static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
887 {
888         int data_len = cursor->total_resid;
889         struct bio_vec bv;
890         int sg_cnt;
891
892         if (!data_len)
893                 return 0;
894
895         sg_cnt = need_padding(data_len) ? 1 : 0;
896         do {
897                 get_bvec_at(cursor, &bv);
898                 sg_cnt++;
899
900                 ceph_msg_data_advance(cursor, bv.bv_len);
901         } while (cursor->total_resid);
902
903         return sg_cnt;
904 }
905
906 static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
907 {
908         void *end = buf + buf_len;
909         struct page *page;
910         int len;
911         void *p;
912
913         if (!buf_len)
914                 return;
915
916         if (is_vmalloc_addr(buf)) {
917                 p = buf;
918                 do {
919                         page = vmalloc_to_page(p);
920                         len = min_t(int, end - p, PAGE_SIZE);
921                         WARN_ON(!page || !len || offset_in_page(p));
922                         sg_set_page(*sg, page, len, 0);
923                         *sg = sg_next(*sg);
924                         p += len;
925                 } while (p != end);
926         } else {
927                 sg_set_buf(*sg, buf, buf_len);
928                 *sg = sg_next(*sg);
929         }
930
931         if (need_padding(buf_len)) {
932                 sg_set_buf(*sg, pad, padding_len(buf_len));
933                 *sg = sg_next(*sg);
934         }
935 }
936
937 static void init_sgs_cursor(struct scatterlist **sg,
938                             struct ceph_msg_data_cursor *cursor, u8 *pad)
939 {
940         int data_len = cursor->total_resid;
941         struct bio_vec bv;
942
943         if (!data_len)
944                 return;
945
946         do {
947                 get_bvec_at(cursor, &bv);
948                 sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
949                 *sg = sg_next(*sg);
950
951                 ceph_msg_data_advance(cursor, bv.bv_len);
952         } while (cursor->total_resid);
953
954         if (need_padding(data_len)) {
955                 sg_set_buf(*sg, pad, padding_len(data_len));
956                 *sg = sg_next(*sg);
957         }
958 }
959
960 static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
961                              u8 *front_pad, u8 *middle_pad, u8 *data_pad,
962                              void *epilogue, bool add_tag)
963 {
964         struct ceph_msg_data_cursor cursor;
965         struct scatterlist *cur_sg;
966         int sg_cnt;
967         int ret;
968
969         if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
970                 return 0;
971
972         sg_cnt = 1;  /* epilogue + [auth tag] */
973         if (front_len(msg))
974                 sg_cnt += calc_sg_cnt(msg->front.iov_base,
975                                       front_len(msg));
976         if (middle_len(msg))
977                 sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
978                                       middle_len(msg));
979         if (data_len(msg)) {
980                 ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
981                 sg_cnt += calc_sg_cnt_cursor(&cursor);
982         }
983
984         ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
985         if (ret)
986                 return ret;
987
988         cur_sg = sgt->sgl;
989         if (front_len(msg))
990                 init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
991                          front_pad);
992         if (middle_len(msg))
993                 init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
994                          middle_pad);
995         if (data_len(msg)) {
996                 ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
997                 init_sgs_cursor(&cur_sg, &cursor, data_pad);
998         }
999
1000         WARN_ON(!sg_is_last(cur_sg));
1001         sg_set_buf(cur_sg, epilogue,
1002                    CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
1003         return 0;
1004 }
1005
1006 static int decrypt_preamble(struct ceph_connection *con)
1007 {
1008         struct scatterlist sg;
1009
1010         sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
1011         return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
1012 }
1013
1014 static int decrypt_control_remainder(struct ceph_connection *con)
1015 {
1016         int ctrl_len = con->v2.in_desc.fd_lens[0];
1017         int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
1018         int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
1019         struct scatterlist sgs[2];
1020
1021         WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
1022         WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
1023
1024         sg_init_table(sgs, 2);
1025         sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
1026         sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
1027
1028         return gcm_crypt(con, false, sgs, sgs,
1029                          padded_len(rem_len) + CEPH_GCM_TAG_LEN);
1030 }
1031
1032 static int decrypt_tail(struct ceph_connection *con)
1033 {
1034         struct sg_table enc_sgt = {};
1035         struct sg_table sgt = {};
1036         int tail_len;
1037         int ret;
1038
1039         tail_len = tail_onwire_len(con->in_msg, true);
1040         ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages,
1041                                         con->v2.in_enc_page_cnt, 0, tail_len,
1042                                         GFP_NOIO);
1043         if (ret)
1044                 goto out;
1045
1046         ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
1047                         MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
1048                         con->v2.in_buf, true);
1049         if (ret)
1050                 goto out;
1051
1052         dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con,
1053              con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents);
1054         ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len);
1055         if (ret)
1056                 goto out;
1057
1058         WARN_ON(!con->v2.in_enc_page_cnt);
1059         ceph_release_page_vector(con->v2.in_enc_pages,
1060                                  con->v2.in_enc_page_cnt);
1061         con->v2.in_enc_pages = NULL;
1062         con->v2.in_enc_page_cnt = 0;
1063
1064 out:
1065         sg_free_table(&sgt);
1066         sg_free_table(&enc_sgt);
1067         return ret;
1068 }
1069
1070 static int prepare_banner(struct ceph_connection *con)
1071 {
1072         int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
1073         void *buf, *p;
1074
1075         buf = alloc_conn_buf(con, buf_len);
1076         if (!buf)
1077                 return -ENOMEM;
1078
1079         p = buf;
1080         ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
1081         ceph_encode_16(&p, sizeof(u64) + sizeof(u64));
1082         ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES);
1083         ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES);
1084         WARN_ON(p != buf + buf_len);
1085
1086         add_out_kvec(con, buf, buf_len);
1087         add_out_sign_kvec(con, buf, buf_len);
1088         ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
1089         return 0;
1090 }
1091
1092 /*
1093  * base:
1094  *   preamble
1095  *   control body (ctrl_len bytes)
1096  *   space for control crc
1097  *
1098  * extdata (optional):
1099  *   control body (extdata_len bytes)
1100  *
1101  * Compute control crc and gather base and extdata into:
1102  *
1103  *   preamble
1104  *   control body (ctrl_len + extdata_len bytes)
1105  *   control crc
1106  *
1107  * Preamble should already be encoded at the start of base.
1108  */
1109 static void prepare_head_plain(struct ceph_connection *con, void *base,
1110                                int ctrl_len, void *extdata, int extdata_len,
1111                                bool to_be_signed)
1112 {
1113         int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
1114         void *crcp = base + base_len - CEPH_CRC_LEN;
1115         u32 crc;
1116
1117         crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
1118         if (extdata_len)
1119                 crc = crc32c(crc, extdata, extdata_len);
1120         put_unaligned_le32(crc, crcp);
1121
1122         if (!extdata_len) {
1123                 add_out_kvec(con, base, base_len);
1124                 if (to_be_signed)
1125                         add_out_sign_kvec(con, base, base_len);
1126                 return;
1127         }
1128
1129         add_out_kvec(con, base, crcp - base);
1130         add_out_kvec(con, extdata, extdata_len);
1131         add_out_kvec(con, crcp, CEPH_CRC_LEN);
1132         if (to_be_signed) {
1133                 add_out_sign_kvec(con, base, crcp - base);
1134                 add_out_sign_kvec(con, extdata, extdata_len);
1135                 add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
1136         }
1137 }
1138
1139 static int prepare_head_secure_small(struct ceph_connection *con,
1140                                      void *base, int ctrl_len)
1141 {
1142         struct scatterlist sg;
1143         int ret;
1144
1145         /* inline buffer padding? */
1146         if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
1147                 memset(CTRL_BODY(base) + ctrl_len, 0,
1148                        CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
1149
1150         sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
1151         ret = gcm_crypt(con, true, &sg, &sg,
1152                         CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
1153         if (ret)
1154                 return ret;
1155
1156         add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
1157         return 0;
1158 }
1159
1160 /*
1161  * base:
1162  *   preamble
1163  *   control body (ctrl_len bytes)
1164  *   space for padding, if needed
1165  *   space for control remainder auth tag
1166  *   space for preamble auth tag
1167  *
1168  * Encrypt preamble and the inline portion, then encrypt the remainder
1169  * and gather into:
1170  *
1171  *   preamble
1172  *   control body (48 bytes)
1173  *   preamble auth tag
1174  *   control body (ctrl_len - 48 bytes)
1175  *   zero padding, if needed
1176  *   control remainder auth tag
1177  *
1178  * Preamble should already be encoded at the start of base.
1179  */
1180 static int prepare_head_secure_big(struct ceph_connection *con,
1181                                    void *base, int ctrl_len)
1182 {
1183         int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
1184         void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
1185         void *rem_tag = rem + padded_len(rem_len);
1186         void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
1187         struct scatterlist sgs[2];
1188         int ret;
1189
1190         sg_init_table(sgs, 2);
1191         sg_set_buf(&sgs[0], base, rem - base);
1192         sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
1193         ret = gcm_crypt(con, true, sgs, sgs, rem - base);
1194         if (ret)
1195                 return ret;
1196
1197         /* control remainder padding? */
1198         if (need_padding(rem_len))
1199                 memset(rem + rem_len, 0, padding_len(rem_len));
1200
1201         sg_init_one(&sgs[0], rem, pmbl_tag - rem);
1202         ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
1203         if (ret)
1204                 return ret;
1205
1206         add_out_kvec(con, base, rem - base);
1207         add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
1208         add_out_kvec(con, rem, pmbl_tag - rem);
1209         return 0;
1210 }
1211
1212 static int __prepare_control(struct ceph_connection *con, int tag,
1213                              void *base, int ctrl_len, void *extdata,
1214                              int extdata_len, bool to_be_signed)
1215 {
1216         int total_len = ctrl_len + extdata_len;
1217         struct ceph_frame_desc desc;
1218         int ret;
1219
1220         dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
1221              total_len, ctrl_len, extdata_len);
1222
1223         /* extdata may be vmalloc'ed but not base */
1224         if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
1225                 return -EINVAL;
1226
1227         init_frame_desc(&desc, tag, &total_len, 1);
1228         encode_preamble(&desc, base);
1229
1230         if (con_secure(con)) {
1231                 if (WARN_ON(extdata_len || to_be_signed))
1232                         return -EINVAL;
1233
1234                 if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
1235                         /* fully inlined, inline buffer may need padding */
1236                         ret = prepare_head_secure_small(con, base, ctrl_len);
1237                 else
1238                         /* partially inlined, inline buffer is full */
1239                         ret = prepare_head_secure_big(con, base, ctrl_len);
1240                 if (ret)
1241                         return ret;
1242         } else {
1243                 prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
1244                                    to_be_signed);
1245         }
1246
1247         ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
1248         return 0;
1249 }
1250
1251 static int prepare_control(struct ceph_connection *con, int tag,
1252                            void *base, int ctrl_len)
1253 {
1254         return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false);
1255 }
1256
1257 static int prepare_hello(struct ceph_connection *con)
1258 {
1259         void *buf, *p;
1260         int ctrl_len;
1261
1262         ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
1263         buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
1264         if (!buf)
1265                 return -ENOMEM;
1266
1267         p = CTRL_BODY(buf);
1268         ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT);
1269         ceph_encode_entity_addr(&p, &con->peer_addr);
1270         WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
1271
1272         return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len,
1273                                  NULL, 0, true);
1274 }
1275
1276 /* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
1277 #define AUTH_BUF_LEN    (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
1278
1279 static int prepare_auth_request(struct ceph_connection *con)
1280 {
1281         void *authorizer, *authorizer_copy;
1282         int ctrl_len, authorizer_len;
1283         void *buf;
1284         int ret;
1285
1286         ctrl_len = AUTH_BUF_LEN;
1287         buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
1288         if (!buf)
1289                 return -ENOMEM;
1290
1291         mutex_unlock(&con->mutex);
1292         ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
1293                                          &authorizer, &authorizer_len);
1294         mutex_lock(&con->mutex);
1295         if (con->state != CEPH_CON_S_V2_HELLO) {
1296                 dout("%s con %p state changed to %d\n", __func__, con,
1297                      con->state);
1298                 return -EAGAIN;
1299         }
1300
1301         dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
1302         if (ret)
1303                 return ret;
1304
1305         authorizer_copy = alloc_conn_buf(con, authorizer_len);
1306         if (!authorizer_copy)
1307                 return -ENOMEM;
1308
1309         memcpy(authorizer_copy, authorizer, authorizer_len);
1310
1311         return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
1312                                  authorizer_copy, authorizer_len, true);
1313 }
1314
1315 static int prepare_auth_request_more(struct ceph_connection *con,
1316                                      void *reply, int reply_len)
1317 {
1318         int ctrl_len, authorizer_len;
1319         void *authorizer;
1320         void *buf;
1321         int ret;
1322
1323         ctrl_len = AUTH_BUF_LEN;
1324         buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
1325         if (!buf)
1326                 return -ENOMEM;
1327
1328         mutex_unlock(&con->mutex);
1329         ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
1330                                                CTRL_BODY(buf), &ctrl_len,
1331                                                &authorizer, &authorizer_len);
1332         mutex_lock(&con->mutex);
1333         if (con->state != CEPH_CON_S_V2_AUTH) {
1334                 dout("%s con %p state changed to %d\n", __func__, con,
1335                      con->state);
1336                 return -EAGAIN;
1337         }
1338
1339         dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
1340         if (ret)
1341                 return ret;
1342
1343         return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
1344                                  ctrl_len, authorizer, authorizer_len, true);
1345 }
1346
1347 static int prepare_auth_signature(struct ceph_connection *con)
1348 {
1349         void *buf;
1350         int ret;
1351
1352         buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE,
1353                                                   con_secure(con)));
1354         if (!buf)
1355                 return -ENOMEM;
1356
1357         ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
1358                           CTRL_BODY(buf));
1359         if (ret)
1360                 return ret;
1361
1362         return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
1363                                SHA256_DIGEST_SIZE);
1364 }
1365
1366 static int prepare_client_ident(struct ceph_connection *con)
1367 {
1368         struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
1369         struct ceph_client *client = from_msgr(con->msgr);
1370         u64 global_id = ceph_client_gid(client);
1371         void *buf, *p;
1372         int ctrl_len;
1373
1374         WARN_ON(con->v2.server_cookie);
1375         WARN_ON(con->v2.connect_seq);
1376         WARN_ON(con->v2.peer_global_seq);
1377
1378         if (!con->v2.client_cookie) {
1379                 do {
1380                         get_random_bytes(&con->v2.client_cookie,
1381                                          sizeof(con->v2.client_cookie));
1382                 } while (!con->v2.client_cookie);
1383                 dout("%s con %p generated cookie 0x%llx\n", __func__, con,
1384                      con->v2.client_cookie);
1385         } else {
1386                 dout("%s con %p cookie already set 0x%llx\n", __func__, con,
1387                      con->v2.client_cookie);
1388         }
1389
1390         dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
1391              __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
1392              ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
1393              global_id, con->v2.global_seq, client->supported_features,
1394              client->required_features, con->v2.client_cookie);
1395
1396         ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
1397                    ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
1398         buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
1399         if (!buf)
1400                 return -ENOMEM;
1401
1402         p = CTRL_BODY(buf);
1403         ceph_encode_8(&p, 2);  /* addrvec marker */
1404         ceph_encode_32(&p, 1);  /* addr_cnt */
1405         ceph_encode_entity_addr(&p, my_addr);
1406         ceph_encode_entity_addr(&p, &con->peer_addr);
1407         ceph_encode_64(&p, global_id);
1408         ceph_encode_64(&p, con->v2.global_seq);
1409         ceph_encode_64(&p, client->supported_features);
1410         ceph_encode_64(&p, client->required_features);
1411         ceph_encode_64(&p, 0);  /* flags */
1412         ceph_encode_64(&p, con->v2.client_cookie);
1413         WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
1414
1415         return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
1416 }
1417
1418 static int prepare_session_reconnect(struct ceph_connection *con)
1419 {
1420         struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
1421         void *buf, *p;
1422         int ctrl_len;
1423
1424         WARN_ON(!con->v2.client_cookie);
1425         WARN_ON(!con->v2.server_cookie);
1426         WARN_ON(!con->v2.connect_seq);
1427         WARN_ON(!con->v2.peer_global_seq);
1428
1429         dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
1430              __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
1431              con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
1432              con->v2.connect_seq, con->in_seq);
1433
1434         ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
1435         buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
1436         if (!buf)
1437                 return -ENOMEM;
1438
1439         p = CTRL_BODY(buf);
1440         ceph_encode_8(&p, 2);  /* entity_addrvec_t marker */
1441         ceph_encode_32(&p, 1);  /* my_addrs len */
1442         ceph_encode_entity_addr(&p, my_addr);
1443         ceph_encode_64(&p, con->v2.client_cookie);
1444         ceph_encode_64(&p, con->v2.server_cookie);
1445         ceph_encode_64(&p, con->v2.global_seq);
1446         ceph_encode_64(&p, con->v2.connect_seq);
1447         ceph_encode_64(&p, con->in_seq);
1448         WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
1449
1450         return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
1451 }
1452
1453 static int prepare_keepalive2(struct ceph_connection *con)
1454 {
1455         struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf);
1456         struct timespec64 now;
1457
1458         ktime_get_real_ts64(&now);
1459         dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec,
1460              now.tv_nsec);
1461
1462         ceph_encode_timespec64(ts, &now);
1463
1464         reset_out_kvecs(con);
1465         return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
1466                                sizeof(struct ceph_timespec));
1467 }
1468
1469 static int prepare_ack(struct ceph_connection *con)
1470 {
1471         void *p;
1472
1473         dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
1474              con->in_seq_acked, con->in_seq);
1475         con->in_seq_acked = con->in_seq;
1476
1477         p = CTRL_BODY(con->v2.out_buf);
1478         ceph_encode_64(&p, con->in_seq_acked);
1479
1480         reset_out_kvecs(con);
1481         return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
1482 }
1483
1484 static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
1485 {
1486         dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
1487              con->out_msg, aborted, con->v2.out_epil.front_crc,
1488              con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
1489
1490         encode_epilogue_plain(con, aborted);
1491         add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN);
1492 }
1493
1494 /*
1495  * For "used" empty segments, crc is -1.  For unused (trailing)
1496  * segments, crc is 0.
1497  */
1498 static void prepare_message_plain(struct ceph_connection *con)
1499 {
1500         struct ceph_msg *msg = con->out_msg;
1501
1502         prepare_head_plain(con, con->v2.out_buf,
1503                            sizeof(struct ceph_msg_header2), NULL, 0, false);
1504
1505         if (!front_len(msg) && !middle_len(msg)) {
1506                 if (!data_len(msg)) {
1507                         /*
1508                          * Empty message: once the head is written,
1509                          * we are done -- there is no epilogue.
1510                          */
1511                         con->v2.out_state = OUT_S_FINISH_MESSAGE;
1512                         return;
1513                 }
1514
1515                 con->v2.out_epil.front_crc = -1;
1516                 con->v2.out_epil.middle_crc = -1;
1517                 con->v2.out_state = OUT_S_QUEUE_DATA;
1518                 return;
1519         }
1520
1521         if (front_len(msg)) {
1522                 con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
1523                                                     front_len(msg));
1524                 add_out_kvec(con, msg->front.iov_base, front_len(msg));
1525         } else {
1526                 /* middle (at least) is there, checked above */
1527                 con->v2.out_epil.front_crc = -1;
1528         }
1529
1530         if (middle_len(msg)) {
1531                 con->v2.out_epil.middle_crc =
1532                         crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
1533                 add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
1534         } else {
1535                 con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
1536         }
1537
1538         if (data_len(msg)) {
1539                 con->v2.out_state = OUT_S_QUEUE_DATA;
1540         } else {
1541                 con->v2.out_epil.data_crc = 0;
1542                 prepare_epilogue_plain(con, false);
1543                 con->v2.out_state = OUT_S_FINISH_MESSAGE;
1544         }
1545 }
1546
1547 /*
1548  * Unfortunately the kernel crypto API doesn't support streaming
1549  * (piecewise) operation for AEAD algorithms, so we can't get away
1550  * with a fixed size buffer and a couple sgs.  Instead, we have to
1551  * allocate pages for the entire tail of the message (currently up
1552  * to ~32M) and two sgs arrays (up to ~256K each)...
1553  */
1554 static int prepare_message_secure(struct ceph_connection *con)
1555 {
1556         void *zerop = page_address(ceph_zero_page);
1557         struct sg_table enc_sgt = {};
1558         struct sg_table sgt = {};
1559         struct page **enc_pages;
1560         int enc_page_cnt;
1561         int tail_len;
1562         int ret;
1563
1564         ret = prepare_head_secure_small(con, con->v2.out_buf,
1565                                         sizeof(struct ceph_msg_header2));
1566         if (ret)
1567                 return ret;
1568
1569         tail_len = tail_onwire_len(con->out_msg, true);
1570         if (!tail_len) {
1571                 /*
1572                  * Empty message: once the head is written,
1573                  * we are done -- there is no epilogue.
1574                  */
1575                 con->v2.out_state = OUT_S_FINISH_MESSAGE;
1576                 return 0;
1577         }
1578
1579         encode_epilogue_secure(con, false);
1580         ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop,
1581                                 &con->v2.out_epil, false);
1582         if (ret)
1583                 goto out;
1584
1585         enc_page_cnt = calc_pages_for(0, tail_len);
1586         enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
1587         if (IS_ERR(enc_pages)) {
1588                 ret = PTR_ERR(enc_pages);
1589                 goto out;
1590         }
1591
1592         WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
1593         con->v2.out_enc_pages = enc_pages;
1594         con->v2.out_enc_page_cnt = enc_page_cnt;
1595         con->v2.out_enc_resid = tail_len;
1596         con->v2.out_enc_i = 0;
1597
1598         ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
1599                                         0, tail_len, GFP_NOIO);
1600         if (ret)
1601                 goto out;
1602
1603         ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
1604                         tail_len - CEPH_GCM_TAG_LEN);
1605         if (ret)
1606                 goto out;
1607
1608         dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
1609              con->out_msg, sgt.orig_nents, enc_page_cnt);
1610         con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
1611
1612 out:
1613         sg_free_table(&sgt);
1614         sg_free_table(&enc_sgt);
1615         return ret;
1616 }
1617
1618 static int prepare_message(struct ceph_connection *con)
1619 {
1620         int lens[] = {
1621                 sizeof(struct ceph_msg_header2),
1622                 front_len(con->out_msg),
1623                 middle_len(con->out_msg),
1624                 data_len(con->out_msg)
1625         };
1626         struct ceph_frame_desc desc;
1627         int ret;
1628
1629         dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
1630              con->out_msg, lens[0], lens[1], lens[2], lens[3]);
1631
1632         if (con->in_seq > con->in_seq_acked) {
1633                 dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
1634                      con->in_seq_acked, con->in_seq);
1635                 con->in_seq_acked = con->in_seq;
1636         }
1637
1638         reset_out_kvecs(con);
1639         init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
1640         encode_preamble(&desc, con->v2.out_buf);
1641         fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr,
1642                      con->in_seq_acked);
1643
1644         if (con_secure(con)) {
1645                 ret = prepare_message_secure(con);
1646                 if (ret)
1647                         return ret;
1648         } else {
1649                 prepare_message_plain(con);
1650         }
1651
1652         ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
1653         return 0;
1654 }
1655
1656 static int prepare_read_banner_prefix(struct ceph_connection *con)
1657 {
1658         void *buf;
1659
1660         buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN);
1661         if (!buf)
1662                 return -ENOMEM;
1663
1664         reset_in_kvecs(con);
1665         add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
1666         add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
1667         con->state = CEPH_CON_S_V2_BANNER_PREFIX;
1668         return 0;
1669 }
1670
1671 static int prepare_read_banner_payload(struct ceph_connection *con,
1672                                        int payload_len)
1673 {
1674         void *buf;
1675
1676         buf = alloc_conn_buf(con, payload_len);
1677         if (!buf)
1678                 return -ENOMEM;
1679
1680         reset_in_kvecs(con);
1681         add_in_kvec(con, buf, payload_len);
1682         add_in_sign_kvec(con, buf, payload_len);
1683         con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
1684         return 0;
1685 }
1686
1687 static void prepare_read_preamble(struct ceph_connection *con)
1688 {
1689         reset_in_kvecs(con);
1690         add_in_kvec(con, con->v2.in_buf,
1691                     con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN :
1692                                       CEPH_PREAMBLE_PLAIN_LEN);
1693         con->v2.in_state = IN_S_HANDLE_PREAMBLE;
1694 }
1695
1696 static int prepare_read_control(struct ceph_connection *con)
1697 {
1698         int ctrl_len = con->v2.in_desc.fd_lens[0];
1699         int head_len;
1700         void *buf;
1701
1702         reset_in_kvecs(con);
1703         if (con->state == CEPH_CON_S_V2_HELLO ||
1704             con->state == CEPH_CON_S_V2_AUTH) {
1705                 head_len = head_onwire_len(ctrl_len, false);
1706                 buf = alloc_conn_buf(con, head_len);
1707                 if (!buf)
1708                         return -ENOMEM;
1709
1710                 /* preserve preamble */
1711                 memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
1712
1713                 add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
1714                 add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
1715                 add_in_sign_kvec(con, buf, head_len);
1716         } else {
1717                 if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
1718                         buf = alloc_conn_buf(con, ctrl_len);
1719                         if (!buf)
1720                                 return -ENOMEM;
1721
1722                         add_in_kvec(con, buf, ctrl_len);
1723                 } else {
1724                         add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
1725                 }
1726                 add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
1727         }
1728         con->v2.in_state = IN_S_HANDLE_CONTROL;
1729         return 0;
1730 }
1731
1732 static int prepare_read_control_remainder(struct ceph_connection *con)
1733 {
1734         int ctrl_len = con->v2.in_desc.fd_lens[0];
1735         int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
1736         void *buf;
1737
1738         buf = alloc_conn_buf(con, ctrl_len);
1739         if (!buf)
1740                 return -ENOMEM;
1741
1742         memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
1743
1744         reset_in_kvecs(con);
1745         add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len);
1746         add_in_kvec(con, con->v2.in_buf,
1747                     padding_len(rem_len) + CEPH_GCM_TAG_LEN);
1748         con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
1749         return 0;
1750 }
1751
1752 static int prepare_read_data(struct ceph_connection *con)
1753 {
1754         struct bio_vec bv;
1755
1756         con->in_data_crc = -1;
1757         ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
1758                                   data_len(con->in_msg));
1759
1760         get_bvec_at(&con->v2.in_cursor, &bv);
1761         if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1762                 if (unlikely(!con->bounce_page)) {
1763                         con->bounce_page = alloc_page(GFP_NOIO);
1764                         if (!con->bounce_page) {
1765                                 pr_err("failed to allocate bounce page\n");
1766                                 return -ENOMEM;
1767                         }
1768                 }
1769
1770                 bv.bv_page = con->bounce_page;
1771                 bv.bv_offset = 0;
1772         }
1773         set_in_bvec(con, &bv);
1774         con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
1775         return 0;
1776 }
1777
1778 static void prepare_read_data_cont(struct ceph_connection *con)
1779 {
1780         struct bio_vec bv;
1781
1782         if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1783                 con->in_data_crc = crc32c(con->in_data_crc,
1784                                           page_address(con->bounce_page),
1785                                           con->v2.in_bvec.bv_len);
1786
1787                 get_bvec_at(&con->v2.in_cursor, &bv);
1788                 memcpy_to_page(bv.bv_page, bv.bv_offset,
1789                                page_address(con->bounce_page),
1790                                con->v2.in_bvec.bv_len);
1791         } else {
1792                 con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
1793                                                     con->v2.in_bvec.bv_page,
1794                                                     con->v2.in_bvec.bv_offset,
1795                                                     con->v2.in_bvec.bv_len);
1796         }
1797
1798         ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
1799         if (con->v2.in_cursor.total_resid) {
1800                 get_bvec_at(&con->v2.in_cursor, &bv);
1801                 if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1802                         bv.bv_page = con->bounce_page;
1803                         bv.bv_offset = 0;
1804                 }
1805                 set_in_bvec(con, &bv);
1806                 WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
1807                 return;
1808         }
1809
1810         /*
1811          * We've read all data.  Prepare to read epilogue.
1812          */
1813         reset_in_kvecs(con);
1814         add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
1815         con->v2.in_state = IN_S_HANDLE_EPILOGUE;
1816 }
1817
1818 static int prepare_read_tail_plain(struct ceph_connection *con)
1819 {
1820         struct ceph_msg *msg = con->in_msg;
1821
1822         if (!front_len(msg) && !middle_len(msg)) {
1823                 WARN_ON(!data_len(msg));
1824                 return prepare_read_data(con);
1825         }
1826
1827         reset_in_kvecs(con);
1828         if (front_len(msg)) {
1829                 add_in_kvec(con, msg->front.iov_base, front_len(msg));
1830                 WARN_ON(msg->front.iov_len != front_len(msg));
1831         }
1832         if (middle_len(msg)) {
1833                 add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
1834                 WARN_ON(msg->middle->vec.iov_len != middle_len(msg));
1835         }
1836
1837         if (data_len(msg)) {
1838                 con->v2.in_state = IN_S_PREPARE_READ_DATA;
1839         } else {
1840                 add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
1841                 con->v2.in_state = IN_S_HANDLE_EPILOGUE;
1842         }
1843         return 0;
1844 }
1845
1846 static void prepare_read_enc_page(struct ceph_connection *con)
1847 {
1848         struct bio_vec bv;
1849
1850         dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i,
1851              con->v2.in_enc_resid);
1852         WARN_ON(!con->v2.in_enc_resid);
1853
1854         bvec_set_page(&bv, con->v2.in_enc_pages[con->v2.in_enc_i],
1855                       min(con->v2.in_enc_resid, (int)PAGE_SIZE), 0);
1856
1857         set_in_bvec(con, &bv);
1858         con->v2.in_enc_i++;
1859         con->v2.in_enc_resid -= bv.bv_len;
1860
1861         if (con->v2.in_enc_resid) {
1862                 con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE;
1863                 return;
1864         }
1865
1866         /*
1867          * We are set to read the last piece of ciphertext (ending
1868          * with epilogue) + auth tag.
1869          */
1870         WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt);
1871         con->v2.in_state = IN_S_HANDLE_EPILOGUE;
1872 }
1873
1874 static int prepare_read_tail_secure(struct ceph_connection *con)
1875 {
1876         struct page **enc_pages;
1877         int enc_page_cnt;
1878         int tail_len;
1879
1880         tail_len = tail_onwire_len(con->in_msg, true);
1881         WARN_ON(!tail_len);
1882
1883         enc_page_cnt = calc_pages_for(0, tail_len);
1884         enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
1885         if (IS_ERR(enc_pages))
1886                 return PTR_ERR(enc_pages);
1887
1888         WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt);
1889         con->v2.in_enc_pages = enc_pages;
1890         con->v2.in_enc_page_cnt = enc_page_cnt;
1891         con->v2.in_enc_resid = tail_len;
1892         con->v2.in_enc_i = 0;
1893
1894         prepare_read_enc_page(con);
1895         return 0;
1896 }
1897
1898 static void __finish_skip(struct ceph_connection *con)
1899 {
1900         con->in_seq++;
1901         prepare_read_preamble(con);
1902 }
1903
1904 static void prepare_skip_message(struct ceph_connection *con)
1905 {
1906         struct ceph_frame_desc *desc = &con->v2.in_desc;
1907         int tail_len;
1908
1909         dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1],
1910              desc->fd_lens[2], desc->fd_lens[3]);
1911
1912         tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2],
1913                                      desc->fd_lens[3], con_secure(con));
1914         if (!tail_len) {
1915                 __finish_skip(con);
1916         } else {
1917                 set_in_skip(con, tail_len);
1918                 con->v2.in_state = IN_S_FINISH_SKIP;
1919         }
1920 }
1921
1922 static int process_banner_prefix(struct ceph_connection *con)
1923 {
1924         int payload_len;
1925         void *p;
1926
1927         WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
1928
1929         p = con->v2.in_kvecs[0].iov_base;
1930         if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) {
1931                 if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN))
1932                         con->error_msg = "server is speaking msgr1 protocol";
1933                 else
1934                         con->error_msg = "protocol error, bad banner";
1935                 return -EINVAL;
1936         }
1937
1938         p += CEPH_BANNER_V2_LEN;
1939         payload_len = ceph_decode_16(&p);
1940         dout("%s con %p payload_len %d\n", __func__, con, payload_len);
1941
1942         return prepare_read_banner_payload(con, payload_len);
1943 }
1944
1945 static int process_banner_payload(struct ceph_connection *con)
1946 {
1947         void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
1948         u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
1949         u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
1950         u64 server_feat, server_req_feat;
1951         void *p;
1952         int ret;
1953
1954         p = con->v2.in_kvecs[0].iov_base;
1955         ceph_decode_64_safe(&p, end, server_feat, bad);
1956         ceph_decode_64_safe(&p, end, server_req_feat, bad);
1957
1958         dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
1959              __func__, con, server_feat, server_req_feat);
1960
1961         if (req_feat & ~server_feat) {
1962                 pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
1963                        server_feat, req_feat & ~server_feat);
1964                 con->error_msg = "missing required protocol features";
1965                 return -EINVAL;
1966         }
1967         if (server_req_feat & ~feat) {
1968                 pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
1969                        feat, server_req_feat & ~feat);
1970                 con->error_msg = "missing required protocol features";
1971                 return -EINVAL;
1972         }
1973
1974         /* no reset_out_kvecs() as our banner may still be pending */
1975         ret = prepare_hello(con);
1976         if (ret) {
1977                 pr_err("prepare_hello failed: %d\n", ret);
1978                 return ret;
1979         }
1980
1981         con->state = CEPH_CON_S_V2_HELLO;
1982         prepare_read_preamble(con);
1983         return 0;
1984
1985 bad:
1986         pr_err("failed to decode banner payload\n");
1987         return -EINVAL;
1988 }
1989
1990 static int process_hello(struct ceph_connection *con, void *p, void *end)
1991 {
1992         struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
1993         struct ceph_entity_addr addr_for_me;
1994         u8 entity_type;
1995         int ret;
1996
1997         if (con->state != CEPH_CON_S_V2_HELLO) {
1998                 con->error_msg = "protocol error, unexpected hello";
1999                 return -EINVAL;
2000         }
2001
2002         ceph_decode_8_safe(&p, end, entity_type, bad);
2003         ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
2004         if (ret) {
2005                 pr_err("failed to decode addr_for_me: %d\n", ret);
2006                 return ret;
2007         }
2008
2009         dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
2010              entity_type, ceph_pr_addr(&addr_for_me));
2011
2012         if (entity_type != con->peer_name.type) {
2013                 pr_err("bad peer type, want %d, got %d\n",
2014                        con->peer_name.type, entity_type);
2015                 con->error_msg = "wrong peer at address";
2016                 return -EINVAL;
2017         }
2018
2019         /*
2020          * Set our address to the address our first peer (i.e. monitor)
2021          * sees that we are connecting from.  If we are behind some sort
2022          * of NAT and want to be identified by some private (not NATed)
2023          * address, ip option should be used.
2024          */
2025         if (ceph_addr_is_blank(my_addr)) {
2026                 memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
2027                        sizeof(my_addr->in_addr));
2028                 ceph_addr_set_port(my_addr, 0);
2029                 dout("%s con %p set my addr %s, as seen by peer %s\n",
2030                      __func__, con, ceph_pr_addr(my_addr),
2031                      ceph_pr_addr(&con->peer_addr));
2032         } else {
2033                 dout("%s con %p my addr already set %s\n",
2034                      __func__, con, ceph_pr_addr(my_addr));
2035         }
2036
2037         WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
2038         WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
2039         WARN_ON(!my_addr->nonce);
2040
2041         /* no reset_out_kvecs() as our hello may still be pending */
2042         ret = prepare_auth_request(con);
2043         if (ret) {
2044                 if (ret != -EAGAIN)
2045                         pr_err("prepare_auth_request failed: %d\n", ret);
2046                 return ret;
2047         }
2048
2049         con->state = CEPH_CON_S_V2_AUTH;
2050         return 0;
2051
2052 bad:
2053         pr_err("failed to decode hello\n");
2054         return -EINVAL;
2055 }
2056
2057 static int process_auth_bad_method(struct ceph_connection *con,
2058                                    void *p, void *end)
2059 {
2060         int allowed_protos[8], allowed_modes[8];
2061         int allowed_proto_cnt, allowed_mode_cnt;
2062         int used_proto, result;
2063         int ret;
2064         int i;
2065
2066         if (con->state != CEPH_CON_S_V2_AUTH) {
2067                 con->error_msg = "protocol error, unexpected auth_bad_method";
2068                 return -EINVAL;
2069         }
2070
2071         ceph_decode_32_safe(&p, end, used_proto, bad);
2072         ceph_decode_32_safe(&p, end, result, bad);
2073         dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
2074              result);
2075
2076         ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
2077         if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
2078                 pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
2079                 return -EINVAL;
2080         }
2081         for (i = 0; i < allowed_proto_cnt; i++) {
2082                 ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
2083                 dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
2084                      i, allowed_protos[i]);
2085         }
2086
2087         ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
2088         if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
2089                 pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
2090                 return -EINVAL;
2091         }
2092         for (i = 0; i < allowed_mode_cnt; i++) {
2093                 ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
2094                 dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
2095                      i, allowed_modes[i]);
2096         }
2097
2098         mutex_unlock(&con->mutex);
2099         ret = con->ops->handle_auth_bad_method(con, used_proto, result,
2100                                                allowed_protos,
2101                                                allowed_proto_cnt,
2102                                                allowed_modes,
2103                                                allowed_mode_cnt);
2104         mutex_lock(&con->mutex);
2105         if (con->state != CEPH_CON_S_V2_AUTH) {
2106                 dout("%s con %p state changed to %d\n", __func__, con,
2107                      con->state);
2108                 return -EAGAIN;
2109         }
2110
2111         dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
2112         return ret;
2113
2114 bad:
2115         pr_err("failed to decode auth_bad_method\n");
2116         return -EINVAL;
2117 }
2118
2119 static int process_auth_reply_more(struct ceph_connection *con,
2120                                    void *p, void *end)
2121 {
2122         int payload_len;
2123         int ret;
2124
2125         if (con->state != CEPH_CON_S_V2_AUTH) {
2126                 con->error_msg = "protocol error, unexpected auth_reply_more";
2127                 return -EINVAL;
2128         }
2129
2130         ceph_decode_32_safe(&p, end, payload_len, bad);
2131         ceph_decode_need(&p, end, payload_len, bad);
2132
2133         dout("%s con %p payload_len %d\n", __func__, con, payload_len);
2134
2135         reset_out_kvecs(con);
2136         ret = prepare_auth_request_more(con, p, payload_len);
2137         if (ret) {
2138                 if (ret != -EAGAIN)
2139                         pr_err("prepare_auth_request_more failed: %d\n", ret);
2140                 return ret;
2141         }
2142
2143         return 0;
2144
2145 bad:
2146         pr_err("failed to decode auth_reply_more\n");
2147         return -EINVAL;
2148 }
2149
2150 /*
2151  * Align session_key and con_secret to avoid GFP_ATOMIC allocation
2152  * inside crypto_shash_setkey() and crypto_aead_setkey() called from
2153  * setup_crypto().  __aligned(16) isn't guaranteed to work for stack
2154  * objects, so do it by hand.
2155  */
2156 static int process_auth_done(struct ceph_connection *con, void *p, void *end)
2157 {
2158         u8 session_key_buf[CEPH_KEY_LEN + 16];
2159         u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16];
2160         u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16);
2161         u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16);
2162         int session_key_len, con_secret_len;
2163         int payload_len;
2164         u64 global_id;
2165         int ret;
2166
2167         if (con->state != CEPH_CON_S_V2_AUTH) {
2168                 con->error_msg = "protocol error, unexpected auth_done";
2169                 return -EINVAL;
2170         }
2171
2172         ceph_decode_64_safe(&p, end, global_id, bad);
2173         ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
2174         ceph_decode_32_safe(&p, end, payload_len, bad);
2175
2176         dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
2177              __func__, con, global_id, con->v2.con_mode, payload_len);
2178
2179         mutex_unlock(&con->mutex);
2180         session_key_len = 0;
2181         con_secret_len = 0;
2182         ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
2183                                          session_key, &session_key_len,
2184                                          con_secret, &con_secret_len);
2185         mutex_lock(&con->mutex);
2186         if (con->state != CEPH_CON_S_V2_AUTH) {
2187                 dout("%s con %p state changed to %d\n", __func__, con,
2188                      con->state);
2189                 ret = -EAGAIN;
2190                 goto out;
2191         }
2192
2193         dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
2194         if (ret)
2195                 goto out;
2196
2197         ret = setup_crypto(con, session_key, session_key_len, con_secret,
2198                            con_secret_len);
2199         if (ret)
2200                 goto out;
2201
2202         reset_out_kvecs(con);
2203         ret = prepare_auth_signature(con);
2204         if (ret) {
2205                 pr_err("prepare_auth_signature failed: %d\n", ret);
2206                 goto out;
2207         }
2208
2209         con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
2210
2211 out:
2212         memzero_explicit(session_key_buf, sizeof(session_key_buf));
2213         memzero_explicit(con_secret_buf, sizeof(con_secret_buf));
2214         return ret;
2215
2216 bad:
2217         pr_err("failed to decode auth_done\n");
2218         return -EINVAL;
2219 }
2220
2221 static int process_auth_signature(struct ceph_connection *con,
2222                                   void *p, void *end)
2223 {
2224         u8 hmac[SHA256_DIGEST_SIZE];
2225         int ret;
2226
2227         if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
2228                 con->error_msg = "protocol error, unexpected auth_signature";
2229                 return -EINVAL;
2230         }
2231
2232         ret = hmac_sha256(con, con->v2.out_sign_kvecs,
2233                           con->v2.out_sign_kvec_cnt, hmac);
2234         if (ret)
2235                 return ret;
2236
2237         ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
2238         if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
2239                 con->error_msg = "integrity error, bad auth signature";
2240                 return -EBADMSG;
2241         }
2242
2243         dout("%s con %p auth signature ok\n", __func__, con);
2244
2245         /* no reset_out_kvecs() as our auth_signature may still be pending */
2246         if (!con->v2.server_cookie) {
2247                 ret = prepare_client_ident(con);
2248                 if (ret) {
2249                         pr_err("prepare_client_ident failed: %d\n", ret);
2250                         return ret;
2251                 }
2252
2253                 con->state = CEPH_CON_S_V2_SESSION_CONNECT;
2254         } else {
2255                 ret = prepare_session_reconnect(con);
2256                 if (ret) {
2257                         pr_err("prepare_session_reconnect failed: %d\n", ret);
2258                         return ret;
2259                 }
2260
2261                 con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
2262         }
2263
2264         return 0;
2265
2266 bad:
2267         pr_err("failed to decode auth_signature\n");
2268         return -EINVAL;
2269 }
2270
2271 static int process_server_ident(struct ceph_connection *con,
2272                                 void *p, void *end)
2273 {
2274         struct ceph_client *client = from_msgr(con->msgr);
2275         u64 features, required_features;
2276         struct ceph_entity_addr addr;
2277         u64 global_seq;
2278         u64 global_id;
2279         u64 cookie;
2280         u64 flags;
2281         int ret;
2282
2283         if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
2284                 con->error_msg = "protocol error, unexpected server_ident";
2285                 return -EINVAL;
2286         }
2287
2288         ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
2289         if (ret) {
2290                 pr_err("failed to decode server addrs: %d\n", ret);
2291                 return ret;
2292         }
2293
2294         ceph_decode_64_safe(&p, end, global_id, bad);
2295         ceph_decode_64_safe(&p, end, global_seq, bad);
2296         ceph_decode_64_safe(&p, end, features, bad);
2297         ceph_decode_64_safe(&p, end, required_features, bad);
2298         ceph_decode_64_safe(&p, end, flags, bad);
2299         ceph_decode_64_safe(&p, end, cookie, bad);
2300
2301         dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
2302              __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
2303              global_id, global_seq, features, required_features, flags, cookie);
2304
2305         /* is this who we intended to talk to? */
2306         if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
2307                 pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
2308                        ceph_pr_addr(&con->peer_addr),
2309                        le32_to_cpu(con->peer_addr.nonce),
2310                        ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
2311                 con->error_msg = "wrong peer at address";
2312                 return -EINVAL;
2313         }
2314
2315         if (client->required_features & ~features) {
2316                 pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
2317                        features, client->required_features & ~features);
2318                 con->error_msg = "missing required protocol features";
2319                 return -EINVAL;
2320         }
2321
2322         /*
2323          * Both name->type and name->num are set in ceph_con_open() but
2324          * name->num may be bogus in the initial monmap.  name->type is
2325          * verified in handle_hello().
2326          */
2327         WARN_ON(!con->peer_name.type);
2328         con->peer_name.num = cpu_to_le64(global_id);
2329         con->v2.peer_global_seq = global_seq;
2330         con->peer_features = features;
2331         WARN_ON(required_features & ~client->supported_features);
2332         con->v2.server_cookie = cookie;
2333
2334         if (flags & CEPH_MSG_CONNECT_LOSSY) {
2335                 ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
2336                 WARN_ON(con->v2.server_cookie);
2337         } else {
2338                 WARN_ON(!con->v2.server_cookie);
2339         }
2340
2341         clear_in_sign_kvecs(con);
2342         clear_out_sign_kvecs(con);
2343         free_conn_bufs(con);
2344         con->delay = 0;  /* reset backoff memory */
2345
2346         con->state = CEPH_CON_S_OPEN;
2347         con->v2.out_state = OUT_S_GET_NEXT;
2348         return 0;
2349
2350 bad:
2351         pr_err("failed to decode server_ident\n");
2352         return -EINVAL;
2353 }
2354
2355 static int process_ident_missing_features(struct ceph_connection *con,
2356                                           void *p, void *end)
2357 {
2358         struct ceph_client *client = from_msgr(con->msgr);
2359         u64 missing_features;
2360
2361         if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
2362                 con->error_msg = "protocol error, unexpected ident_missing_features";
2363                 return -EINVAL;
2364         }
2365
2366         ceph_decode_64_safe(&p, end, missing_features, bad);
2367         pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
2368                client->supported_features, missing_features);
2369         con->error_msg = "missing required protocol features";
2370         return -EINVAL;
2371
2372 bad:
2373         pr_err("failed to decode ident_missing_features\n");
2374         return -EINVAL;
2375 }
2376
2377 static int process_session_reconnect_ok(struct ceph_connection *con,
2378                                         void *p, void *end)
2379 {
2380         u64 seq;
2381
2382         if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
2383                 con->error_msg = "protocol error, unexpected session_reconnect_ok";
2384                 return -EINVAL;
2385         }
2386
2387         ceph_decode_64_safe(&p, end, seq, bad);
2388
2389         dout("%s con %p seq %llu\n", __func__, con, seq);
2390         ceph_con_discard_requeued(con, seq);
2391
2392         clear_in_sign_kvecs(con);
2393         clear_out_sign_kvecs(con);
2394         free_conn_bufs(con);
2395         con->delay = 0;  /* reset backoff memory */
2396
2397         con->state = CEPH_CON_S_OPEN;
2398         con->v2.out_state = OUT_S_GET_NEXT;
2399         return 0;
2400
2401 bad:
2402         pr_err("failed to decode session_reconnect_ok\n");
2403         return -EINVAL;
2404 }
2405
2406 static int process_session_retry(struct ceph_connection *con,
2407                                  void *p, void *end)
2408 {
2409         u64 connect_seq;
2410         int ret;
2411
2412         if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
2413                 con->error_msg = "protocol error, unexpected session_retry";
2414                 return -EINVAL;
2415         }
2416
2417         ceph_decode_64_safe(&p, end, connect_seq, bad);
2418
2419         dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
2420         WARN_ON(connect_seq <= con->v2.connect_seq);
2421         con->v2.connect_seq = connect_seq + 1;
2422
2423         free_conn_bufs(con);
2424
2425         reset_out_kvecs(con);
2426         ret = prepare_session_reconnect(con);
2427         if (ret) {
2428                 pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
2429                 return ret;
2430         }
2431
2432         return 0;
2433
2434 bad:
2435         pr_err("failed to decode session_retry\n");
2436         return -EINVAL;
2437 }
2438
2439 static int process_session_retry_global(struct ceph_connection *con,
2440                                         void *p, void *end)
2441 {
2442         u64 global_seq;
2443         int ret;
2444
2445         if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
2446                 con->error_msg = "protocol error, unexpected session_retry_global";
2447                 return -EINVAL;
2448         }
2449
2450         ceph_decode_64_safe(&p, end, global_seq, bad);
2451
2452         dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
2453         WARN_ON(global_seq <= con->v2.global_seq);
2454         con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
2455
2456         free_conn_bufs(con);
2457
2458         reset_out_kvecs(con);
2459         ret = prepare_session_reconnect(con);
2460         if (ret) {
2461                 pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
2462                 return ret;
2463         }
2464
2465         return 0;
2466
2467 bad:
2468         pr_err("failed to decode session_retry_global\n");
2469         return -EINVAL;
2470 }
2471
2472 static int process_session_reset(struct ceph_connection *con,
2473                                  void *p, void *end)
2474 {
2475         bool full;
2476         int ret;
2477
2478         if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
2479                 con->error_msg = "protocol error, unexpected session_reset";
2480                 return -EINVAL;
2481         }
2482
2483         ceph_decode_8_safe(&p, end, full, bad);
2484         if (!full) {
2485                 con->error_msg = "protocol error, bad session_reset";
2486                 return -EINVAL;
2487         }
2488
2489         pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
2490                 ceph_pr_addr(&con->peer_addr));
2491         ceph_con_reset_session(con);
2492
2493         mutex_unlock(&con->mutex);
2494         if (con->ops->peer_reset)
2495                 con->ops->peer_reset(con);
2496         mutex_lock(&con->mutex);
2497         if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
2498                 dout("%s con %p state changed to %d\n", __func__, con,
2499                      con->state);
2500                 return -EAGAIN;
2501         }
2502
2503         free_conn_bufs(con);
2504
2505         reset_out_kvecs(con);
2506         ret = prepare_client_ident(con);
2507         if (ret) {
2508                 pr_err("prepare_client_ident (rst) failed: %d\n", ret);
2509                 return ret;
2510         }
2511
2512         con->state = CEPH_CON_S_V2_SESSION_CONNECT;
2513         return 0;
2514
2515 bad:
2516         pr_err("failed to decode session_reset\n");
2517         return -EINVAL;
2518 }
2519
2520 static int process_keepalive2_ack(struct ceph_connection *con,
2521                                   void *p, void *end)
2522 {
2523         if (con->state != CEPH_CON_S_OPEN) {
2524                 con->error_msg = "protocol error, unexpected keepalive2_ack";
2525                 return -EINVAL;
2526         }
2527
2528         ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
2529         ceph_decode_timespec64(&con->last_keepalive_ack, p);
2530
2531         dout("%s con %p timestamp %lld.%09ld\n", __func__, con,
2532              con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec);
2533
2534         return 0;
2535
2536 bad:
2537         pr_err("failed to decode keepalive2_ack\n");
2538         return -EINVAL;
2539 }
2540
2541 static int process_ack(struct ceph_connection *con, void *p, void *end)
2542 {
2543         u64 seq;
2544
2545         if (con->state != CEPH_CON_S_OPEN) {
2546                 con->error_msg = "protocol error, unexpected ack";
2547                 return -EINVAL;
2548         }
2549
2550         ceph_decode_64_safe(&p, end, seq, bad);
2551
2552         dout("%s con %p seq %llu\n", __func__, con, seq);
2553         ceph_con_discard_sent(con, seq);
2554         return 0;
2555
2556 bad:
2557         pr_err("failed to decode ack\n");
2558         return -EINVAL;
2559 }
2560
2561 static int process_control(struct ceph_connection *con, void *p, void *end)
2562 {
2563         int tag = con->v2.in_desc.fd_tag;
2564         int ret;
2565
2566         dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p));
2567
2568         switch (tag) {
2569         case FRAME_TAG_HELLO:
2570                 ret = process_hello(con, p, end);
2571                 break;
2572         case FRAME_TAG_AUTH_BAD_METHOD:
2573                 ret = process_auth_bad_method(con, p, end);
2574                 break;
2575         case FRAME_TAG_AUTH_REPLY_MORE:
2576                 ret = process_auth_reply_more(con, p, end);
2577                 break;
2578         case FRAME_TAG_AUTH_DONE:
2579                 ret = process_auth_done(con, p, end);
2580                 break;
2581         case FRAME_TAG_AUTH_SIGNATURE:
2582                 ret = process_auth_signature(con, p, end);
2583                 break;
2584         case FRAME_TAG_SERVER_IDENT:
2585                 ret = process_server_ident(con, p, end);
2586                 break;
2587         case FRAME_TAG_IDENT_MISSING_FEATURES:
2588                 ret = process_ident_missing_features(con, p, end);
2589                 break;
2590         case FRAME_TAG_SESSION_RECONNECT_OK:
2591                 ret = process_session_reconnect_ok(con, p, end);
2592                 break;
2593         case FRAME_TAG_SESSION_RETRY:
2594                 ret = process_session_retry(con, p, end);
2595                 break;
2596         case FRAME_TAG_SESSION_RETRY_GLOBAL:
2597                 ret = process_session_retry_global(con, p, end);
2598                 break;
2599         case FRAME_TAG_SESSION_RESET:
2600                 ret = process_session_reset(con, p, end);
2601                 break;
2602         case FRAME_TAG_KEEPALIVE2_ACK:
2603                 ret = process_keepalive2_ack(con, p, end);
2604                 break;
2605         case FRAME_TAG_ACK:
2606                 ret = process_ack(con, p, end);
2607                 break;
2608         default:
2609                 pr_err("bad tag %d\n", tag);
2610                 con->error_msg = "protocol error, bad tag";
2611                 return -EINVAL;
2612         }
2613         if (ret) {
2614                 dout("%s con %p error %d\n", __func__, con, ret);
2615                 return ret;
2616         }
2617
2618         prepare_read_preamble(con);
2619         return 0;
2620 }
2621
2622 /*
2623  * Return:
2624  *   1 - con->in_msg set, read message
2625  *   0 - skip message
2626  *  <0 - error
2627  */
2628 static int process_message_header(struct ceph_connection *con,
2629                                   void *p, void *end)
2630 {
2631         struct ceph_frame_desc *desc = &con->v2.in_desc;
2632         struct ceph_msg_header2 *hdr2 = p;
2633         struct ceph_msg_header hdr;
2634         int skip;
2635         int ret;
2636         u64 seq;
2637
2638         /* verify seq# */
2639         seq = le64_to_cpu(hdr2->seq);
2640         if ((s64)seq - (s64)con->in_seq < 1) {
2641                 pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
2642                         ENTITY_NAME(con->peer_name),
2643                         ceph_pr_addr(&con->peer_addr),
2644                         seq, con->in_seq + 1);
2645                 return 0;
2646         }
2647         if ((s64)seq - (s64)con->in_seq > 1) {
2648                 pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
2649                 con->error_msg = "bad message sequence # for incoming message";
2650                 return -EBADE;
2651         }
2652
2653         ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
2654
2655         fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
2656                     desc->fd_lens[3], &con->peer_name);
2657         ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
2658         if (ret)
2659                 return ret;
2660
2661         WARN_ON(!con->in_msg ^ skip);
2662         if (skip)
2663                 return 0;
2664
2665         WARN_ON(!con->in_msg);
2666         WARN_ON(con->in_msg->con != con);
2667         return 1;
2668 }
2669
2670 static int process_message(struct ceph_connection *con)
2671 {
2672         ceph_con_process_message(con);
2673
2674         /*
2675          * We could have been closed by ceph_con_close() because
2676          * ceph_con_process_message() temporarily drops con->mutex.
2677          */
2678         if (con->state != CEPH_CON_S_OPEN) {
2679                 dout("%s con %p state changed to %d\n", __func__, con,
2680                      con->state);
2681                 return -EAGAIN;
2682         }
2683
2684         prepare_read_preamble(con);
2685         return 0;
2686 }
2687
2688 static int __handle_control(struct ceph_connection *con, void *p)
2689 {
2690         void *end = p + con->v2.in_desc.fd_lens[0];
2691         struct ceph_msg *msg;
2692         int ret;
2693
2694         if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
2695                 return process_control(con, p, end);
2696
2697         ret = process_message_header(con, p, end);
2698         if (ret < 0)
2699                 return ret;
2700         if (ret == 0) {
2701                 prepare_skip_message(con);
2702                 return 0;
2703         }
2704
2705         msg = con->in_msg;  /* set in process_message_header() */
2706         if (front_len(msg)) {
2707                 WARN_ON(front_len(msg) > msg->front_alloc_len);
2708                 msg->front.iov_len = front_len(msg);
2709         } else {
2710                 msg->front.iov_len = 0;
2711         }
2712         if (middle_len(msg)) {
2713                 WARN_ON(middle_len(msg) > msg->middle->alloc_len);
2714                 msg->middle->vec.iov_len = middle_len(msg);
2715         } else if (msg->middle) {
2716                 msg->middle->vec.iov_len = 0;
2717         }
2718
2719         if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
2720                 return process_message(con);
2721
2722         if (con_secure(con))
2723                 return prepare_read_tail_secure(con);
2724
2725         return prepare_read_tail_plain(con);
2726 }
2727
2728 static int handle_preamble(struct ceph_connection *con)
2729 {
2730         struct ceph_frame_desc *desc = &con->v2.in_desc;
2731         int ret;
2732
2733         if (con_secure(con)) {
2734                 ret = decrypt_preamble(con);
2735                 if (ret) {
2736                         if (ret == -EBADMSG)
2737                                 con->error_msg = "integrity error, bad preamble auth tag";
2738                         return ret;
2739                 }
2740         }
2741
2742         ret = decode_preamble(con->v2.in_buf, desc);
2743         if (ret) {
2744                 if (ret == -EBADMSG)
2745                         con->error_msg = "integrity error, bad crc";
2746                 else
2747                         con->error_msg = "protocol error, bad preamble";
2748                 return ret;
2749         }
2750
2751         dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
2752              con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
2753              desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
2754
2755         if (!con_secure(con))
2756                 return prepare_read_control(con);
2757
2758         if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
2759                 return prepare_read_control_remainder(con);
2760
2761         return __handle_control(con, CTRL_BODY(con->v2.in_buf));
2762 }
2763
2764 static int handle_control(struct ceph_connection *con)
2765 {
2766         int ctrl_len = con->v2.in_desc.fd_lens[0];
2767         void *buf;
2768         int ret;
2769
2770         WARN_ON(con_secure(con));
2771
2772         ret = verify_control_crc(con);
2773         if (ret) {
2774                 con->error_msg = "integrity error, bad crc";
2775                 return ret;
2776         }
2777
2778         if (con->state == CEPH_CON_S_V2_AUTH) {
2779                 buf = alloc_conn_buf(con, ctrl_len);
2780                 if (!buf)
2781                         return -ENOMEM;
2782
2783                 memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
2784                 return __handle_control(con, buf);
2785         }
2786
2787         return __handle_control(con, con->v2.in_kvecs[0].iov_base);
2788 }
2789
2790 static int handle_control_remainder(struct ceph_connection *con)
2791 {
2792         int ret;
2793
2794         WARN_ON(!con_secure(con));
2795
2796         ret = decrypt_control_remainder(con);
2797         if (ret) {
2798                 if (ret == -EBADMSG)
2799                         con->error_msg = "integrity error, bad control remainder auth tag";
2800                 return ret;
2801         }
2802
2803         return __handle_control(con, con->v2.in_kvecs[0].iov_base -
2804                                      CEPH_PREAMBLE_INLINE_LEN);
2805 }
2806
2807 static int handle_epilogue(struct ceph_connection *con)
2808 {
2809         u32 front_crc, middle_crc, data_crc;
2810         int ret;
2811
2812         if (con_secure(con)) {
2813                 ret = decrypt_tail(con);
2814                 if (ret) {
2815                         if (ret == -EBADMSG)
2816                                 con->error_msg = "integrity error, bad epilogue auth tag";
2817                         return ret;
2818                 }
2819
2820                 /* just late_status */
2821                 ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
2822                 if (ret) {
2823                         con->error_msg = "protocol error, bad epilogue";
2824                         return ret;
2825                 }
2826         } else {
2827                 ret = decode_epilogue(con->v2.in_buf, &front_crc,
2828                                       &middle_crc, &data_crc);
2829                 if (ret) {
2830                         con->error_msg = "protocol error, bad epilogue";
2831                         return ret;
2832                 }
2833
2834                 ret = verify_epilogue_crcs(con, front_crc, middle_crc,
2835                                            data_crc);
2836                 if (ret) {
2837                         con->error_msg = "integrity error, bad crc";
2838                         return ret;
2839                 }
2840         }
2841
2842         return process_message(con);
2843 }
2844
2845 static void finish_skip(struct ceph_connection *con)
2846 {
2847         dout("%s con %p\n", __func__, con);
2848
2849         if (con_secure(con))
2850                 gcm_inc_nonce(&con->v2.in_gcm_nonce);
2851
2852         __finish_skip(con);
2853 }
2854
2855 static int populate_in_iter(struct ceph_connection *con)
2856 {
2857         int ret;
2858
2859         dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
2860              con->v2.in_state);
2861         WARN_ON(iov_iter_count(&con->v2.in_iter));
2862
2863         if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
2864                 ret = process_banner_prefix(con);
2865         } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
2866                 ret = process_banner_payload(con);
2867         } else if ((con->state >= CEPH_CON_S_V2_HELLO &&
2868                     con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
2869                    con->state == CEPH_CON_S_OPEN) {
2870                 switch (con->v2.in_state) {
2871                 case IN_S_HANDLE_PREAMBLE:
2872                         ret = handle_preamble(con);
2873                         break;
2874                 case IN_S_HANDLE_CONTROL:
2875                         ret = handle_control(con);
2876                         break;
2877                 case IN_S_HANDLE_CONTROL_REMAINDER:
2878                         ret = handle_control_remainder(con);
2879                         break;
2880                 case IN_S_PREPARE_READ_DATA:
2881                         ret = prepare_read_data(con);
2882                         break;
2883                 case IN_S_PREPARE_READ_DATA_CONT:
2884                         prepare_read_data_cont(con);
2885                         ret = 0;
2886                         break;
2887                 case IN_S_PREPARE_READ_ENC_PAGE:
2888                         prepare_read_enc_page(con);
2889                         ret = 0;
2890                         break;
2891                 case IN_S_HANDLE_EPILOGUE:
2892                         ret = handle_epilogue(con);
2893                         break;
2894                 case IN_S_FINISH_SKIP:
2895                         finish_skip(con);
2896                         ret = 0;
2897                         break;
2898                 default:
2899                         WARN(1, "bad in_state %d", con->v2.in_state);
2900                         return -EINVAL;
2901                 }
2902         } else {
2903                 WARN(1, "bad state %d", con->state);
2904                 return -EINVAL;
2905         }
2906         if (ret) {
2907                 dout("%s con %p error %d\n", __func__, con, ret);
2908                 return ret;
2909         }
2910
2911         if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
2912                 return -ENODATA;
2913         dout("%s con %p populated %zu\n", __func__, con,
2914              iov_iter_count(&con->v2.in_iter));
2915         return 1;
2916 }
2917
2918 int ceph_con_v2_try_read(struct ceph_connection *con)
2919 {
2920         int ret;
2921
2922         dout("%s con %p state %d need %zu\n", __func__, con, con->state,
2923              iov_iter_count(&con->v2.in_iter));
2924
2925         if (con->state == CEPH_CON_S_PREOPEN)
2926                 return 0;
2927
2928         /*
2929          * We should always have something pending here.  If not,
2930          * avoid calling populate_in_iter() as if we read something
2931          * (ceph_tcp_recv() would immediately return 1).
2932          */
2933         if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
2934                 return -ENODATA;
2935
2936         for (;;) {
2937                 ret = ceph_tcp_recv(con);
2938                 if (ret <= 0)
2939                         return ret;
2940
2941                 ret = populate_in_iter(con);
2942                 if (ret <= 0) {
2943                         if (ret && ret != -EAGAIN && !con->error_msg)
2944                                 con->error_msg = "read processing error";
2945                         return ret;
2946                 }
2947         }
2948 }
2949
2950 static void queue_data(struct ceph_connection *con)
2951 {
2952         struct bio_vec bv;
2953
2954         con->v2.out_epil.data_crc = -1;
2955         ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg,
2956                                   data_len(con->out_msg));
2957
2958         get_bvec_at(&con->v2.out_cursor, &bv);
2959         set_out_bvec(con, &bv, true);
2960         con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
2961 }
2962
2963 static void queue_data_cont(struct ceph_connection *con)
2964 {
2965         struct bio_vec bv;
2966
2967         con->v2.out_epil.data_crc = ceph_crc32c_page(
2968                 con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
2969                 con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
2970
2971         ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
2972         if (con->v2.out_cursor.total_resid) {
2973                 get_bvec_at(&con->v2.out_cursor, &bv);
2974                 set_out_bvec(con, &bv, true);
2975                 WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
2976                 return;
2977         }
2978
2979         /*
2980          * We've written all data.  Queue epilogue.  Once it's written,
2981          * we are done.
2982          */
2983         reset_out_kvecs(con);
2984         prepare_epilogue_plain(con, false);
2985         con->v2.out_state = OUT_S_FINISH_MESSAGE;
2986 }
2987
2988 static void queue_enc_page(struct ceph_connection *con)
2989 {
2990         struct bio_vec bv;
2991
2992         dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
2993              con->v2.out_enc_resid);
2994         WARN_ON(!con->v2.out_enc_resid);
2995
2996         bvec_set_page(&bv, con->v2.out_enc_pages[con->v2.out_enc_i],
2997                       min(con->v2.out_enc_resid, (int)PAGE_SIZE), 0);
2998
2999         set_out_bvec(con, &bv, false);
3000         con->v2.out_enc_i++;
3001         con->v2.out_enc_resid -= bv.bv_len;
3002
3003         if (con->v2.out_enc_resid) {
3004                 WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
3005                 return;
3006         }
3007
3008         /*
3009          * We've queued the last piece of ciphertext (ending with
3010          * epilogue) + auth tag.  Once it's written, we are done.
3011          */
3012         WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
3013         con->v2.out_state = OUT_S_FINISH_MESSAGE;
3014 }
3015
3016 static void queue_zeros(struct ceph_connection *con)
3017 {
3018         dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
3019
3020         if (con->v2.out_zero) {
3021                 set_out_bvec_zero(con);
3022                 con->v2.out_zero -= con->v2.out_bvec.bv_len;
3023                 con->v2.out_state = OUT_S_QUEUE_ZEROS;
3024                 return;
3025         }
3026
3027         /*
3028          * We've zero-filled everything up to epilogue.  Queue epilogue
3029          * with late_status set to ABORTED and crcs adjusted for zeros.
3030          * Once it's written, we are done patching up for the revoke.
3031          */
3032         reset_out_kvecs(con);
3033         prepare_epilogue_plain(con, true);
3034         con->v2.out_state = OUT_S_FINISH_MESSAGE;
3035 }
3036
3037 static void finish_message(struct ceph_connection *con)
3038 {
3039         dout("%s con %p msg %p\n", __func__, con, con->out_msg);
3040
3041         /* we end up here both plain and secure modes */
3042         if (con->v2.out_enc_pages) {
3043                 WARN_ON(!con->v2.out_enc_page_cnt);
3044                 ceph_release_page_vector(con->v2.out_enc_pages,
3045                                          con->v2.out_enc_page_cnt);
3046                 con->v2.out_enc_pages = NULL;
3047                 con->v2.out_enc_page_cnt = 0;
3048         }
3049         /* message may have been revoked */
3050         if (con->out_msg) {
3051                 ceph_msg_put(con->out_msg);
3052                 con->out_msg = NULL;
3053         }
3054
3055         con->v2.out_state = OUT_S_GET_NEXT;
3056 }
3057
3058 static int populate_out_iter(struct ceph_connection *con)
3059 {
3060         int ret;
3061
3062         dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
3063              con->v2.out_state);
3064         WARN_ON(iov_iter_count(&con->v2.out_iter));
3065
3066         if (con->state != CEPH_CON_S_OPEN) {
3067                 WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
3068                         con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
3069                 goto nothing_pending;
3070         }
3071
3072         switch (con->v2.out_state) {
3073         case OUT_S_QUEUE_DATA:
3074                 WARN_ON(!con->out_msg);
3075                 queue_data(con);
3076                 goto populated;
3077         case OUT_S_QUEUE_DATA_CONT:
3078                 WARN_ON(!con->out_msg);
3079                 queue_data_cont(con);
3080                 goto populated;
3081         case OUT_S_QUEUE_ENC_PAGE:
3082                 queue_enc_page(con);
3083                 goto populated;
3084         case OUT_S_QUEUE_ZEROS:
3085                 WARN_ON(con->out_msg);  /* revoked */
3086                 queue_zeros(con);
3087                 goto populated;
3088         case OUT_S_FINISH_MESSAGE:
3089                 finish_message(con);
3090                 break;
3091         case OUT_S_GET_NEXT:
3092                 break;
3093         default:
3094                 WARN(1, "bad out_state %d", con->v2.out_state);
3095                 return -EINVAL;
3096         }
3097
3098         WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
3099         if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
3100                 ret = prepare_keepalive2(con);
3101                 if (ret) {
3102                         pr_err("prepare_keepalive2 failed: %d\n", ret);
3103                         return ret;
3104                 }
3105         } else if (!list_empty(&con->out_queue)) {
3106                 ceph_con_get_out_msg(con);
3107                 ret = prepare_message(con);
3108                 if (ret) {
3109                         pr_err("prepare_message failed: %d\n", ret);
3110                         return ret;
3111                 }
3112         } else if (con->in_seq > con->in_seq_acked) {
3113                 ret = prepare_ack(con);
3114                 if (ret) {
3115                         pr_err("prepare_ack failed: %d\n", ret);
3116                         return ret;
3117                 }
3118         } else {
3119                 goto nothing_pending;
3120         }
3121
3122 populated:
3123         if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
3124                 return -ENODATA;
3125         dout("%s con %p populated %zu\n", __func__, con,
3126              iov_iter_count(&con->v2.out_iter));
3127         return 1;
3128
3129 nothing_pending:
3130         WARN_ON(iov_iter_count(&con->v2.out_iter));
3131         dout("%s con %p nothing pending\n", __func__, con);
3132         ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
3133         return 0;
3134 }
3135
3136 int ceph_con_v2_try_write(struct ceph_connection *con)
3137 {
3138         int ret;
3139
3140         dout("%s con %p state %d have %zu\n", __func__, con, con->state,
3141              iov_iter_count(&con->v2.out_iter));
3142
3143         /* open the socket first? */
3144         if (con->state == CEPH_CON_S_PREOPEN) {
3145                 WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
3146
3147                 /*
3148                  * Always bump global_seq.  Bump connect_seq only if
3149                  * there is a session (i.e. we are reconnecting and will
3150                  * send session_reconnect instead of client_ident).
3151                  */
3152                 con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
3153                 if (con->v2.server_cookie)
3154                         con->v2.connect_seq++;
3155
3156                 ret = prepare_read_banner_prefix(con);
3157                 if (ret) {
3158                         pr_err("prepare_read_banner_prefix failed: %d\n", ret);
3159                         con->error_msg = "connect error";
3160                         return ret;
3161                 }
3162
3163                 reset_out_kvecs(con);
3164                 ret = prepare_banner(con);
3165                 if (ret) {
3166                         pr_err("prepare_banner failed: %d\n", ret);
3167                         con->error_msg = "connect error";
3168                         return ret;
3169                 }
3170
3171                 ret = ceph_tcp_connect(con);
3172                 if (ret) {
3173                         pr_err("ceph_tcp_connect failed: %d\n", ret);
3174                         con->error_msg = "connect error";
3175                         return ret;
3176                 }
3177         }
3178
3179         if (!iov_iter_count(&con->v2.out_iter)) {
3180                 ret = populate_out_iter(con);
3181                 if (ret <= 0) {
3182                         if (ret && ret != -EAGAIN && !con->error_msg)
3183                                 con->error_msg = "write processing error";
3184                         return ret;
3185                 }
3186         }
3187
3188         tcp_sock_set_cork(con->sock->sk, true);
3189         for (;;) {
3190                 ret = ceph_tcp_send(con);
3191                 if (ret <= 0)
3192                         break;
3193
3194                 ret = populate_out_iter(con);
3195                 if (ret <= 0) {
3196                         if (ret && ret != -EAGAIN && !con->error_msg)
3197                                 con->error_msg = "write processing error";
3198                         break;
3199                 }
3200         }
3201
3202         tcp_sock_set_cork(con->sock->sk, false);
3203         return ret;
3204 }
3205
3206 static u32 crc32c_zeros(u32 crc, int zero_len)
3207 {
3208         int len;
3209
3210         while (zero_len) {
3211                 len = min(zero_len, (int)PAGE_SIZE);
3212                 crc = crc32c(crc, page_address(ceph_zero_page), len);
3213                 zero_len -= len;
3214         }
3215
3216         return crc;
3217 }
3218
3219 static void prepare_zero_front(struct ceph_connection *con, int resid)
3220 {
3221         int sent;
3222
3223         WARN_ON(!resid || resid > front_len(con->out_msg));
3224         sent = front_len(con->out_msg) - resid;
3225         dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
3226
3227         if (sent) {
3228                 con->v2.out_epil.front_crc =
3229                         crc32c(-1, con->out_msg->front.iov_base, sent);
3230                 con->v2.out_epil.front_crc =
3231                         crc32c_zeros(con->v2.out_epil.front_crc, resid);
3232         } else {
3233                 con->v2.out_epil.front_crc = crc32c_zeros(-1, resid);
3234         }
3235
3236         con->v2.out_iter.count -= resid;
3237         out_zero_add(con, resid);
3238 }
3239
3240 static void prepare_zero_middle(struct ceph_connection *con, int resid)
3241 {
3242         int sent;
3243
3244         WARN_ON(!resid || resid > middle_len(con->out_msg));
3245         sent = middle_len(con->out_msg) - resid;
3246         dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
3247
3248         if (sent) {
3249                 con->v2.out_epil.middle_crc =
3250                         crc32c(-1, con->out_msg->middle->vec.iov_base, sent);
3251                 con->v2.out_epil.middle_crc =
3252                         crc32c_zeros(con->v2.out_epil.middle_crc, resid);
3253         } else {
3254                 con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid);
3255         }
3256
3257         con->v2.out_iter.count -= resid;
3258         out_zero_add(con, resid);
3259 }
3260
3261 static void prepare_zero_data(struct ceph_connection *con)
3262 {
3263         dout("%s con %p\n", __func__, con);
3264         con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg));
3265         out_zero_add(con, data_len(con->out_msg));
3266 }
3267
3268 static void revoke_at_queue_data(struct ceph_connection *con)
3269 {
3270         int boundary;
3271         int resid;
3272
3273         WARN_ON(!data_len(con->out_msg));
3274         WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
3275         resid = iov_iter_count(&con->v2.out_iter);
3276
3277         boundary = front_len(con->out_msg) + middle_len(con->out_msg);
3278         if (resid > boundary) {
3279                 resid -= boundary;
3280                 WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
3281                 dout("%s con %p was sending head\n", __func__, con);
3282                 if (front_len(con->out_msg))
3283                         prepare_zero_front(con, front_len(con->out_msg));
3284                 if (middle_len(con->out_msg))
3285                         prepare_zero_middle(con, middle_len(con->out_msg));
3286                 prepare_zero_data(con);
3287                 WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
3288                 con->v2.out_state = OUT_S_QUEUE_ZEROS;
3289                 return;
3290         }
3291
3292         boundary = middle_len(con->out_msg);
3293         if (resid > boundary) {
3294                 resid -= boundary;
3295                 dout("%s con %p was sending front\n", __func__, con);
3296                 prepare_zero_front(con, resid);
3297                 if (middle_len(con->out_msg))
3298                         prepare_zero_middle(con, middle_len(con->out_msg));
3299                 prepare_zero_data(con);
3300                 queue_zeros(con);
3301                 return;
3302         }
3303
3304         WARN_ON(!resid);
3305         dout("%s con %p was sending middle\n", __func__, con);
3306         prepare_zero_middle(con, resid);
3307         prepare_zero_data(con);
3308         queue_zeros(con);
3309 }
3310
3311 static void revoke_at_queue_data_cont(struct ceph_connection *con)
3312 {
3313         int sent, resid;  /* current piece of data */
3314
3315         WARN_ON(!data_len(con->out_msg));
3316         WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
3317         resid = iov_iter_count(&con->v2.out_iter);
3318         WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
3319         sent = con->v2.out_bvec.bv_len - resid;
3320         dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
3321
3322         if (sent) {
3323                 con->v2.out_epil.data_crc = ceph_crc32c_page(
3324                         con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
3325                         con->v2.out_bvec.bv_offset, sent);
3326                 ceph_msg_data_advance(&con->v2.out_cursor, sent);
3327         }
3328         WARN_ON(resid > con->v2.out_cursor.total_resid);
3329         con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
3330                                                 con->v2.out_cursor.total_resid);
3331
3332         con->v2.out_iter.count -= resid;
3333         out_zero_add(con, con->v2.out_cursor.total_resid);
3334         queue_zeros(con);
3335 }
3336
3337 static void revoke_at_finish_message(struct ceph_connection *con)
3338 {
3339         int boundary;
3340         int resid;
3341
3342         WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
3343         resid = iov_iter_count(&con->v2.out_iter);
3344
3345         if (!front_len(con->out_msg) && !middle_len(con->out_msg) &&
3346             !data_len(con->out_msg)) {
3347                 WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
3348                 dout("%s con %p was sending head (empty message) - noop\n",
3349                      __func__, con);
3350                 return;
3351         }
3352
3353         boundary = front_len(con->out_msg) + middle_len(con->out_msg) +
3354                    CEPH_EPILOGUE_PLAIN_LEN;
3355         if (resid > boundary) {
3356                 resid -= boundary;
3357                 WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
3358                 dout("%s con %p was sending head\n", __func__, con);
3359                 if (front_len(con->out_msg))
3360                         prepare_zero_front(con, front_len(con->out_msg));
3361                 if (middle_len(con->out_msg))
3362                         prepare_zero_middle(con, middle_len(con->out_msg));
3363                 con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
3364                 WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
3365                 con->v2.out_state = OUT_S_QUEUE_ZEROS;
3366                 return;
3367         }
3368
3369         boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN;
3370         if (resid > boundary) {
3371                 resid -= boundary;
3372                 dout("%s con %p was sending front\n", __func__, con);
3373                 prepare_zero_front(con, resid);
3374                 if (middle_len(con->out_msg))
3375                         prepare_zero_middle(con, middle_len(con->out_msg));
3376                 con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
3377                 queue_zeros(con);
3378                 return;
3379         }
3380
3381         boundary = CEPH_EPILOGUE_PLAIN_LEN;
3382         if (resid > boundary) {
3383                 resid -= boundary;
3384                 dout("%s con %p was sending middle\n", __func__, con);
3385                 prepare_zero_middle(con, resid);
3386                 con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
3387                 queue_zeros(con);
3388                 return;
3389         }
3390
3391         WARN_ON(!resid);
3392         dout("%s con %p was sending epilogue - noop\n", __func__, con);
3393 }
3394
3395 void ceph_con_v2_revoke(struct ceph_connection *con)
3396 {
3397         WARN_ON(con->v2.out_zero);
3398
3399         if (con_secure(con)) {
3400                 WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
3401                         con->v2.out_state != OUT_S_FINISH_MESSAGE);
3402                 dout("%s con %p secure - noop\n", __func__, con);
3403                 return;
3404         }
3405
3406         switch (con->v2.out_state) {
3407         case OUT_S_QUEUE_DATA:
3408                 revoke_at_queue_data(con);
3409                 break;
3410         case OUT_S_QUEUE_DATA_CONT:
3411                 revoke_at_queue_data_cont(con);
3412                 break;
3413         case OUT_S_FINISH_MESSAGE:
3414                 revoke_at_finish_message(con);
3415                 break;
3416         default:
3417                 WARN(1, "bad out_state %d", con->v2.out_state);
3418                 break;
3419         }
3420 }
3421
3422 static void revoke_at_prepare_read_data(struct ceph_connection *con)
3423 {
3424         int remaining;
3425         int resid;
3426
3427         WARN_ON(con_secure(con));
3428         WARN_ON(!data_len(con->in_msg));
3429         WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
3430         resid = iov_iter_count(&con->v2.in_iter);
3431         WARN_ON(!resid);
3432
3433         remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
3434         dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
3435              remaining);
3436         con->v2.in_iter.count -= resid;
3437         set_in_skip(con, resid + remaining);
3438         con->v2.in_state = IN_S_FINISH_SKIP;
3439 }
3440
3441 static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
3442 {
3443         int recved, resid;  /* current piece of data */
3444         int remaining;
3445
3446         WARN_ON(con_secure(con));
3447         WARN_ON(!data_len(con->in_msg));
3448         WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
3449         resid = iov_iter_count(&con->v2.in_iter);
3450         WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
3451         recved = con->v2.in_bvec.bv_len - resid;
3452         dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
3453
3454         if (recved)
3455                 ceph_msg_data_advance(&con->v2.in_cursor, recved);
3456         WARN_ON(resid > con->v2.in_cursor.total_resid);
3457
3458         remaining = CEPH_EPILOGUE_PLAIN_LEN;
3459         dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
3460              con->v2.in_cursor.total_resid, remaining);
3461         con->v2.in_iter.count -= resid;
3462         set_in_skip(con, con->v2.in_cursor.total_resid + remaining);
3463         con->v2.in_state = IN_S_FINISH_SKIP;
3464 }
3465
3466 static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
3467 {
3468         int resid;  /* current enc page (not necessarily data) */
3469
3470         WARN_ON(!con_secure(con));
3471         WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
3472         resid = iov_iter_count(&con->v2.in_iter);
3473         WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
3474
3475         dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid,
3476              con->v2.in_enc_resid);
3477         con->v2.in_iter.count -= resid;
3478         set_in_skip(con, resid + con->v2.in_enc_resid);
3479         con->v2.in_state = IN_S_FINISH_SKIP;
3480 }
3481
3482 static void revoke_at_handle_epilogue(struct ceph_connection *con)
3483 {
3484         int resid;
3485
3486         resid = iov_iter_count(&con->v2.in_iter);
3487         WARN_ON(!resid);
3488
3489         dout("%s con %p resid %d\n", __func__, con, resid);
3490         con->v2.in_iter.count -= resid;
3491         set_in_skip(con, resid);
3492         con->v2.in_state = IN_S_FINISH_SKIP;
3493 }
3494
3495 void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
3496 {
3497         switch (con->v2.in_state) {
3498         case IN_S_PREPARE_READ_DATA:
3499                 revoke_at_prepare_read_data(con);
3500                 break;
3501         case IN_S_PREPARE_READ_DATA_CONT:
3502                 revoke_at_prepare_read_data_cont(con);
3503                 break;
3504         case IN_S_PREPARE_READ_ENC_PAGE:
3505                 revoke_at_prepare_read_enc_page(con);
3506                 break;
3507         case IN_S_HANDLE_EPILOGUE:
3508                 revoke_at_handle_epilogue(con);
3509                 break;
3510         default:
3511                 WARN(1, "bad in_state %d", con->v2.in_state);
3512                 break;
3513         }
3514 }
3515
3516 bool ceph_con_v2_opened(struct ceph_connection *con)
3517 {
3518         return con->v2.peer_global_seq;
3519 }
3520
3521 void ceph_con_v2_reset_session(struct ceph_connection *con)
3522 {
3523         con->v2.client_cookie = 0;
3524         con->v2.server_cookie = 0;
3525         con->v2.global_seq = 0;
3526         con->v2.connect_seq = 0;
3527         con->v2.peer_global_seq = 0;
3528 }
3529
3530 void ceph_con_v2_reset_protocol(struct ceph_connection *con)
3531 {
3532         iov_iter_truncate(&con->v2.in_iter, 0);
3533         iov_iter_truncate(&con->v2.out_iter, 0);
3534         con->v2.out_zero = 0;
3535
3536         clear_in_sign_kvecs(con);
3537         clear_out_sign_kvecs(con);
3538         free_conn_bufs(con);
3539
3540         if (con->v2.in_enc_pages) {
3541                 WARN_ON(!con->v2.in_enc_page_cnt);
3542                 ceph_release_page_vector(con->v2.in_enc_pages,
3543                                          con->v2.in_enc_page_cnt);
3544                 con->v2.in_enc_pages = NULL;
3545                 con->v2.in_enc_page_cnt = 0;
3546         }
3547         if (con->v2.out_enc_pages) {
3548                 WARN_ON(!con->v2.out_enc_page_cnt);
3549                 ceph_release_page_vector(con->v2.out_enc_pages,
3550                                          con->v2.out_enc_page_cnt);
3551                 con->v2.out_enc_pages = NULL;
3552                 con->v2.out_enc_page_cnt = 0;
3553         }
3554
3555         con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
3556         memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN);
3557         memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN);
3558
3559         if (con->v2.hmac_tfm) {
3560                 crypto_free_shash(con->v2.hmac_tfm);
3561                 con->v2.hmac_tfm = NULL;
3562         }
3563         if (con->v2.gcm_req) {
3564                 aead_request_free(con->v2.gcm_req);
3565                 con->v2.gcm_req = NULL;
3566         }
3567         if (con->v2.gcm_tfm) {
3568                 crypto_free_aead(con->v2.gcm_tfm);
3569                 con->v2.gcm_tfm = NULL;
3570         }
3571 }