pds_core: devlink params for enabling VIF support
[linux-block.git] / drivers / block / drbd / drbd_receiver.c
CommitLineData
93c68cc4 1// SPDX-License-Identifier: GPL-2.0-only
b411b363
PR
2/*
3 drbd_receiver.c
4
5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
b411b363
PR
11 */
12
13
b411b363
PR
14#include <linux/module.h>
15
7e5fec31 16#include <linux/uaccess.h>
b411b363
PR
17#include <net/sock.h>
18
b411b363
PR
19#include <linux/drbd.h>
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/in.h>
23#include <linux/mm.h>
24#include <linux/memcontrol.h>
25#include <linux/mm_inline.h>
26#include <linux/slab.h>
ae7e81c0 27#include <uapi/linux/sched/types.h>
174cd4b1 28#include <linux/sched/signal.h>
b411b363
PR
29#include <linux/pkt_sched.h>
30#define __KERNEL_SYSCALLS__
31#include <linux/unistd.h>
32#include <linux/vmalloc.h>
33#include <linux/random.h>
b411b363
PR
34#include <linux/string.h>
35#include <linux/scatterlist.h>
c6a564ff 36#include <linux/part_stat.h>
b411b363 37#include "drbd_int.h"
a3603a6e 38#include "drbd_protocol.h"
b411b363 39#include "drbd_req.h"
b411b363
PR
40#include "drbd_vli.h"
41
f31e583a 42#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
20c68fde 43
77351055
PR
44struct packet_info {
45 enum drbd_packet cmd;
e2857216
AG
46 unsigned int size;
47 unsigned int vnr;
e658983a 48 void *data;
77351055
PR
49};
50
b411b363
PR
51enum finish_epoch {
52 FE_STILL_LIVE,
53 FE_DESTROYED,
54 FE_RECYCLED,
55};
56
bde89a9e
AG
57static int drbd_do_features(struct drbd_connection *connection);
58static int drbd_do_auth(struct drbd_connection *connection);
69a22773 59static int drbd_disconnected(struct drbd_peer_device *);
a0fb3c47 60static void conn_wait_active_ee_empty(struct drbd_connection *connection);
bde89a9e 61static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
99920dc5 62static int e_end_block(struct drbd_work *, int);
b411b363 63
b411b363
PR
64
65#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
66
45bb912b
LE
67/*
68 * some helper functions to deal with single linked page lists,
69 * page->private being our "next" pointer.
70 */
71
72/* If at least n pages are linked at head, get n pages off.
73 * Otherwise, don't modify head, and return NULL.
74 * Locking is the responsibility of the caller.
75 */
76static struct page *page_chain_del(struct page **head, int n)
77{
78 struct page *page;
79 struct page *tmp;
80
81 BUG_ON(!n);
82 BUG_ON(!head);
83
84 page = *head;
23ce4227
PR
85
86 if (!page)
87 return NULL;
88
45bb912b
LE
89 while (page) {
90 tmp = page_chain_next(page);
91 if (--n == 0)
92 break; /* found sufficient pages */
93 if (tmp == NULL)
94 /* insufficient pages, don't use any of them. */
95 return NULL;
96 page = tmp;
97 }
98
99 /* add end of list marker for the returned list */
100 set_page_private(page, 0);
101 /* actual return value, and adjustment of head */
102 page = *head;
103 *head = tmp;
104 return page;
105}
106
107/* may be used outside of locks to find the tail of a (usually short)
108 * "private" page chain, before adding it back to a global chain head
109 * with page_chain_add() under a spinlock. */
110static struct page *page_chain_tail(struct page *page, int *len)
111{
112 struct page *tmp;
113 int i = 1;
e8628013
JP
114 while ((tmp = page_chain_next(page))) {
115 ++i;
116 page = tmp;
117 }
45bb912b
LE
118 if (len)
119 *len = i;
120 return page;
121}
122
123static int page_chain_free(struct page *page)
124{
125 struct page *tmp;
126 int i = 0;
127 page_chain_for_each_safe(page, tmp) {
128 put_page(page);
129 ++i;
130 }
131 return i;
132}
133
134static void page_chain_add(struct page **head,
135 struct page *chain_first, struct page *chain_last)
136{
137#if 1
138 struct page *tmp;
139 tmp = page_chain_tail(chain_first, NULL);
140 BUG_ON(tmp != chain_last);
141#endif
142
143 /* add chain to head */
144 set_page_private(chain_last, (unsigned long)*head);
145 *head = chain_first;
146}
147
b30ab791 148static struct page *__drbd_alloc_pages(struct drbd_device *device,
18c2d522 149 unsigned int number)
b411b363
PR
150{
151 struct page *page = NULL;
45bb912b 152 struct page *tmp = NULL;
18c2d522 153 unsigned int i = 0;
b411b363
PR
154
155 /* Yes, testing drbd_pp_vacant outside the lock is racy.
156 * So what. It saves a spin_lock. */
45bb912b 157 if (drbd_pp_vacant >= number) {
b411b363 158 spin_lock(&drbd_pp_lock);
45bb912b
LE
159 page = page_chain_del(&drbd_pp_pool, number);
160 if (page)
161 drbd_pp_vacant -= number;
b411b363 162 spin_unlock(&drbd_pp_lock);
45bb912b
LE
163 if (page)
164 return page;
b411b363 165 }
45bb912b 166
b411b363
PR
167 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
168 * "criss-cross" setup, that might cause write-out on some other DRBD,
169 * which in turn might block on the other node at this very place. */
45bb912b
LE
170 for (i = 0; i < number; i++) {
171 tmp = alloc_page(GFP_TRY);
172 if (!tmp)
173 break;
174 set_page_private(tmp, (unsigned long)page);
175 page = tmp;
176 }
177
178 if (i == number)
179 return page;
180
181 /* Not enough pages immediately available this time.
c37c8ecf 182 * No need to jump around here, drbd_alloc_pages will retry this
45bb912b
LE
183 * function "soon". */
184 if (page) {
185 tmp = page_chain_tail(page, NULL);
186 spin_lock(&drbd_pp_lock);
187 page_chain_add(&drbd_pp_pool, page, tmp);
188 drbd_pp_vacant += i;
189 spin_unlock(&drbd_pp_lock);
190 }
191 return NULL;
b411b363
PR
192}
193
b30ab791 194static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
a990be46 195 struct list_head *to_be_freed)
b411b363 196{
a8cd15ba 197 struct drbd_peer_request *peer_req, *tmp;
b411b363
PR
198
199 /* The EEs are always appended to the end of the list. Since
200 they are sent in order over the wire, they have to finish
201 in order. As soon as we see the first not finished we can
202 stop to examine the list... */
203
a8cd15ba 204 list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
045417f7 205 if (drbd_peer_req_has_active_page(peer_req))
b411b363 206 break;
a8cd15ba 207 list_move(&peer_req->w.list, to_be_freed);
b411b363
PR
208 }
209}
210
668700b4 211static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
b411b363
PR
212{
213 LIST_HEAD(reclaimed);
db830c46 214 struct drbd_peer_request *peer_req, *t;
b411b363 215
0500813f 216 spin_lock_irq(&device->resource->req_lock);
b30ab791 217 reclaim_finished_net_peer_reqs(device, &reclaimed);
0500813f 218 spin_unlock_irq(&device->resource->req_lock);
a8cd15ba 219 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
b30ab791 220 drbd_free_net_peer_req(device, peer_req);
b411b363
PR
221}
222
668700b4
PR
223static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
224{
225 struct drbd_peer_device *peer_device;
226 int vnr;
227
228 rcu_read_lock();
229 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
230 struct drbd_device *device = peer_device->device;
231 if (!atomic_read(&device->pp_in_use_by_net))
232 continue;
233
234 kref_get(&device->kref);
235 rcu_read_unlock();
236 drbd_reclaim_net_peer_reqs(device);
237 kref_put(&device->kref, drbd_destroy_device);
238 rcu_read_lock();
239 }
240 rcu_read_unlock();
241}
242
b411b363 243/**
c37c8ecf 244 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
9b48ff07
LJ
245 * @peer_device: DRBD device.
246 * @number: number of pages requested
247 * @retry: whether to retry, if not enough pages are available right now
45bb912b
LE
248 *
249 * Tries to allocate number pages, first from our own page pool, then from
0e49d7b0 250 * the kernel.
45bb912b 251 * Possibly retry until DRBD frees sufficient pages somewhere else.
b411b363 252 *
0e49d7b0
LE
253 * If this allocation would exceed the max_buffers setting, we throttle
254 * allocation (schedule_timeout) to give the system some room to breathe.
255 *
256 * We do not use max-buffers as hard limit, because it could lead to
257 * congestion and further to a distributed deadlock during online-verify or
258 * (checksum based) resync, if the max-buffers, socket buffer sizes and
259 * resync-rate settings are mis-configured.
260 *
45bb912b 261 * Returns a page chain linked via page->private.
b411b363 262 */
69a22773 263struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
c37c8ecf 264 bool retry)
b411b363 265{
69a22773 266 struct drbd_device *device = peer_device->device;
b411b363 267 struct page *page = NULL;
44ed167d 268 struct net_conf *nc;
b411b363 269 DEFINE_WAIT(wait);
0e49d7b0 270 unsigned int mxb;
b411b363 271
44ed167d 272 rcu_read_lock();
69a22773 273 nc = rcu_dereference(peer_device->connection->net_conf);
44ed167d
PR
274 mxb = nc ? nc->max_buffers : 1000000;
275 rcu_read_unlock();
276
b30ab791
AG
277 if (atomic_read(&device->pp_in_use) < mxb)
278 page = __drbd_alloc_pages(device, number);
b411b363 279
668700b4
PR
280 /* Try to keep the fast path fast, but occasionally we need
281 * to reclaim the pages we lended to the network stack. */
282 if (page && atomic_read(&device->pp_in_use_by_net) > 512)
283 drbd_reclaim_net_peer_reqs(device);
284
45bb912b 285 while (page == NULL) {
b411b363
PR
286 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
287
668700b4 288 drbd_reclaim_net_peer_reqs(device);
b411b363 289
b30ab791
AG
290 if (atomic_read(&device->pp_in_use) < mxb) {
291 page = __drbd_alloc_pages(device, number);
b411b363
PR
292 if (page)
293 break;
294 }
295
296 if (!retry)
297 break;
298
299 if (signal_pending(current)) {
d0180171 300 drbd_warn(device, "drbd_alloc_pages interrupted!\n");
b411b363
PR
301 break;
302 }
303
0e49d7b0
LE
304 if (schedule_timeout(HZ/10) == 0)
305 mxb = UINT_MAX;
b411b363
PR
306 }
307 finish_wait(&drbd_pp_wait, &wait);
308
45bb912b 309 if (page)
b30ab791 310 atomic_add(number, &device->pp_in_use);
b411b363
PR
311 return page;
312}
313
c37c8ecf 314/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
0500813f 315 * Is also used from inside an other spin_lock_irq(&resource->req_lock);
45bb912b
LE
316 * Either links the page chain back to the global pool,
317 * or returns all pages to the system. */
b30ab791 318static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
b411b363 319{
b30ab791 320 atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
b411b363 321 int i;
435f0740 322
a73ff323
LE
323 if (page == NULL)
324 return;
325
183ece30 326 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
45bb912b
LE
327 i = page_chain_free(page);
328 else {
329 struct page *tmp;
330 tmp = page_chain_tail(page, &i);
331 spin_lock(&drbd_pp_lock);
332 page_chain_add(&drbd_pp_pool, page, tmp);
333 drbd_pp_vacant += i;
334 spin_unlock(&drbd_pp_lock);
b411b363 335 }
435f0740 336 i = atomic_sub_return(i, a);
45bb912b 337 if (i < 0)
d0180171 338 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
435f0740 339 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
b411b363
PR
340 wake_up(&drbd_pp_wait);
341}
342
343/*
344You need to hold the req_lock:
345 _drbd_wait_ee_list_empty()
346
347You must not have the req_lock:
3967deb1 348 drbd_free_peer_req()
0db55363 349 drbd_alloc_peer_req()
7721f567 350 drbd_free_peer_reqs()
b411b363 351 drbd_ee_fix_bhs()
a990be46 352 drbd_finish_peer_reqs()
b411b363
PR
353 drbd_clear_done_ee()
354 drbd_wait_ee_list_empty()
355*/
356
9104d31a
LE
357/* normal: payload_size == request size (bi_size)
358 * w_same: payload_size == logical_block_size
359 * trim: payload_size == 0 */
f6ffca9f 360struct drbd_peer_request *
69a22773 361drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
9104d31a 362 unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
b411b363 363{
69a22773 364 struct drbd_device *device = peer_device->device;
db830c46 365 struct drbd_peer_request *peer_req;
a73ff323 366 struct page *page = NULL;
e6be38a1 367 unsigned int nr_pages = PFN_UP(payload_size);
b411b363 368
b30ab791 369 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
b411b363
PR
370 return NULL;
371
0892fac8 372 peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
db830c46 373 if (!peer_req) {
b411b363 374 if (!(gfp_mask & __GFP_NOWARN))
d0180171 375 drbd_err(device, "%s: allocation failed\n", __func__);
b411b363
PR
376 return NULL;
377 }
378
9104d31a 379 if (nr_pages) {
d0164adc
MG
380 page = drbd_alloc_pages(peer_device, nr_pages,
381 gfpflags_allow_blocking(gfp_mask));
a73ff323
LE
382 if (!page)
383 goto fail;
384 }
b411b363 385
c5a2c150
LE
386 memset(peer_req, 0, sizeof(*peer_req));
387 INIT_LIST_HEAD(&peer_req->w.list);
db830c46 388 drbd_clear_interval(&peer_req->i);
9104d31a 389 peer_req->i.size = request_size;
db830c46 390 peer_req->i.sector = sector;
c5a2c150 391 peer_req->submit_jif = jiffies;
a8cd15ba 392 peer_req->peer_device = peer_device;
db830c46 393 peer_req->pages = page;
9a8e7753
AG
394 /*
395 * The block_id is opaque to the receiver. It is not endianness
396 * converted, and sent back to the sender unchanged.
397 */
db830c46 398 peer_req->block_id = id;
b411b363 399
db830c46 400 return peer_req;
b411b363 401
45bb912b 402 fail:
0892fac8 403 mempool_free(peer_req, &drbd_ee_mempool);
b411b363
PR
404 return NULL;
405}
406
b30ab791 407void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
f6ffca9f 408 int is_net)
b411b363 409{
21ae5d7f 410 might_sleep();
db830c46
AG
411 if (peer_req->flags & EE_HAS_DIGEST)
412 kfree(peer_req->digest);
b30ab791 413 drbd_free_pages(device, peer_req->pages, is_net);
0b0ba1ef
AG
414 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
415 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
677b3672 416 if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
21ae5d7f
LE
417 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
418 drbd_al_complete_io(device, &peer_req->i);
419 }
0892fac8 420 mempool_free(peer_req, &drbd_ee_mempool);
b411b363
PR
421}
422
b30ab791 423int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
b411b363
PR
424{
425 LIST_HEAD(work_list);
db830c46 426 struct drbd_peer_request *peer_req, *t;
b411b363 427 int count = 0;
b30ab791 428 int is_net = list == &device->net_ee;
b411b363 429
0500813f 430 spin_lock_irq(&device->resource->req_lock);
b411b363 431 list_splice_init(list, &work_list);
0500813f 432 spin_unlock_irq(&device->resource->req_lock);
b411b363 433
a8cd15ba 434 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
b30ab791 435 __drbd_free_peer_req(device, peer_req, is_net);
b411b363
PR
436 count++;
437 }
438 return count;
439}
440
b411b363 441/*
a990be46 442 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
b411b363 443 */
b30ab791 444static int drbd_finish_peer_reqs(struct drbd_device *device)
b411b363
PR
445{
446 LIST_HEAD(work_list);
447 LIST_HEAD(reclaimed);
db830c46 448 struct drbd_peer_request *peer_req, *t;
e2b3032b 449 int err = 0;
b411b363 450
0500813f 451 spin_lock_irq(&device->resource->req_lock);
b30ab791
AG
452 reclaim_finished_net_peer_reqs(device, &reclaimed);
453 list_splice_init(&device->done_ee, &work_list);
0500813f 454 spin_unlock_irq(&device->resource->req_lock);
b411b363 455
a8cd15ba 456 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
b30ab791 457 drbd_free_net_peer_req(device, peer_req);
b411b363
PR
458
459 /* possible callbacks here:
d4dabbe2 460 * e_end_block, and e_end_resync_block, e_send_superseded.
b411b363
PR
461 * all ignore the last argument.
462 */
a8cd15ba 463 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
e2b3032b
AG
464 int err2;
465
b411b363 466 /* list_del not necessary, next/prev members not touched */
a8cd15ba 467 err2 = peer_req->w.cb(&peer_req->w, !!err);
e2b3032b
AG
468 if (!err)
469 err = err2;
b30ab791 470 drbd_free_peer_req(device, peer_req);
b411b363 471 }
b30ab791 472 wake_up(&device->ee_wait);
b411b363 473
e2b3032b 474 return err;
b411b363
PR
475}
476
b30ab791 477static void _drbd_wait_ee_list_empty(struct drbd_device *device,
d4da1537 478 struct list_head *head)
b411b363
PR
479{
480 DEFINE_WAIT(wait);
481
482 /* avoids spin_lock/unlock
483 * and calling prepare_to_wait in the fast path */
484 while (!list_empty(head)) {
b30ab791 485 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
0500813f 486 spin_unlock_irq(&device->resource->req_lock);
7eaceacc 487 io_schedule();
b30ab791 488 finish_wait(&device->ee_wait, &wait);
0500813f 489 spin_lock_irq(&device->resource->req_lock);
b411b363
PR
490 }
491}
492
b30ab791 493static void drbd_wait_ee_list_empty(struct drbd_device *device,
d4da1537 494 struct list_head *head)
b411b363 495{
0500813f 496 spin_lock_irq(&device->resource->req_lock);
b30ab791 497 _drbd_wait_ee_list_empty(device, head);
0500813f 498 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
499}
500
dbd9eea0 501static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
b411b363 502{
b411b363
PR
503 struct kvec iov = {
504 .iov_base = buf,
505 .iov_len = size,
506 };
507 struct msghdr msg = {
b411b363
PR
508 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
509 };
de4eda9d 510 iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size);
f7765c36 511 return sock_recvmsg(sock, &msg, msg.msg_flags);
b411b363
PR
512}
513
bde89a9e 514static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
b411b363 515{
b411b363
PR
516 int rv;
517
bde89a9e 518 rv = drbd_recv_short(connection->data.socket, buf, size, 0);
b411b363 519
dbd0820c
PR
520 if (rv < 0) {
521 if (rv == -ECONNRESET)
1ec861eb 522 drbd_info(connection, "sock was reset by peer\n");
dbd0820c 523 else if (rv != -ERESTARTSYS)
1ec861eb 524 drbd_err(connection, "sock_recvmsg returned %d\n", rv);
dbd0820c 525 } else if (rv == 0) {
bde89a9e 526 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
b66623e3
PR
527 long t;
528 rcu_read_lock();
bde89a9e 529 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
b66623e3
PR
530 rcu_read_unlock();
531
bde89a9e 532 t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
b66623e3 533
599377ac
PR
534 if (t)
535 goto out;
536 }
1ec861eb 537 drbd_info(connection, "sock was shut down by peer\n");
599377ac
PR
538 }
539
b411b363 540 if (rv != size)
bde89a9e 541 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 542
599377ac 543out:
b411b363
PR
544 return rv;
545}
546
bde89a9e 547static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
c6967746
AG
548{
549 int err;
550
bde89a9e 551 err = drbd_recv(connection, buf, size);
c6967746
AG
552 if (err != size) {
553 if (err >= 0)
554 err = -EIO;
555 } else
556 err = 0;
557 return err;
558}
559
bde89a9e 560static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
a5c31904
AG
561{
562 int err;
563
bde89a9e 564 err = drbd_recv_all(connection, buf, size);
a5c31904 565 if (err && !signal_pending(current))
1ec861eb 566 drbd_warn(connection, "short read (expected size %d)\n", (int)size);
a5c31904
AG
567 return err;
568}
569
5dbf1673
LE
570/* quoting tcp(7):
571 * On individual connections, the socket buffer size must be set prior to the
572 * listen(2) or connect(2) calls in order to have it take effect.
573 * This is our wrapper to do so.
574 */
575static void drbd_setbufsize(struct socket *sock, unsigned int snd,
576 unsigned int rcv)
577{
578 /* open coded SO_SNDBUF, SO_RCVBUF */
579 if (snd) {
580 sock->sk->sk_sndbuf = snd;
581 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
582 }
583 if (rcv) {
584 sock->sk->sk_rcvbuf = rcv;
585 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
586 }
587}
588
bde89a9e 589static struct socket *drbd_try_connect(struct drbd_connection *connection)
b411b363
PR
590{
591 const char *what;
592 struct socket *sock;
593 struct sockaddr_in6 src_in6;
44ed167d
PR
594 struct sockaddr_in6 peer_in6;
595 struct net_conf *nc;
596 int err, peer_addr_len, my_addr_len;
69ef82de 597 int sndbuf_size, rcvbuf_size, connect_int;
b411b363
PR
598 int disconnect_on_error = 1;
599
44ed167d 600 rcu_read_lock();
bde89a9e 601 nc = rcu_dereference(connection->net_conf);
44ed167d
PR
602 if (!nc) {
603 rcu_read_unlock();
b411b363 604 return NULL;
44ed167d 605 }
44ed167d
PR
606 sndbuf_size = nc->sndbuf_size;
607 rcvbuf_size = nc->rcvbuf_size;
69ef82de 608 connect_int = nc->connect_int;
089c075d 609 rcu_read_unlock();
44ed167d 610
bde89a9e
AG
611 my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
612 memcpy(&src_in6, &connection->my_addr, my_addr_len);
44ed167d 613
bde89a9e 614 if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
44ed167d
PR
615 src_in6.sin6_port = 0;
616 else
617 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
618
bde89a9e
AG
619 peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
620 memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
b411b363
PR
621
622 what = "sock_create_kern";
eeb1bd5c 623 err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
44ed167d 624 SOCK_STREAM, IPPROTO_TCP, &sock);
b411b363
PR
625 if (err < 0) {
626 sock = NULL;
627 goto out;
628 }
629
630 sock->sk->sk_rcvtimeo =
69ef82de 631 sock->sk->sk_sndtimeo = connect_int * HZ;
44ed167d 632 drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
b411b363
PR
633
634 /* explicitly bind to the configured IP as source IP
635 * for the outgoing connections.
636 * This is needed for multihomed hosts and to be
637 * able to use lo: interfaces for drbd.
638 * Make sure to use 0 as port number, so linux selects
639 * a free one dynamically.
640 */
b411b363 641 what = "bind before connect";
44ed167d 642 err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
b411b363
PR
643 if (err < 0)
644 goto out;
645
646 /* connect may fail, peer not yet available.
647 * stay C_WF_CONNECTION, don't go Disconnecting! */
648 disconnect_on_error = 0;
649 what = "connect";
44ed167d 650 err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
b411b363
PR
651
652out:
653 if (err < 0) {
654 if (sock) {
655 sock_release(sock);
656 sock = NULL;
657 }
658 switch (-err) {
659 /* timeout, busy, signal pending */
660 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
661 case EINTR: case ERESTARTSYS:
662 /* peer not (yet) available, network problem */
663 case ECONNREFUSED: case ENETUNREACH:
664 case EHOSTDOWN: case EHOSTUNREACH:
665 disconnect_on_error = 0;
666 break;
667 default:
1ec861eb 668 drbd_err(connection, "%s failed, err = %d\n", what, err);
b411b363
PR
669 }
670 if (disconnect_on_error)
bde89a9e 671 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 672 }
44ed167d 673
b411b363
PR
674 return sock;
675}
676
7a426fd8 677struct accept_wait_data {
bde89a9e 678 struct drbd_connection *connection;
7a426fd8
PR
679 struct socket *s_listen;
680 struct completion door_bell;
681 void (*original_sk_state_change)(struct sock *sk);
682
683};
684
715306f6 685static void drbd_incoming_connection(struct sock *sk)
7a426fd8
PR
686{
687 struct accept_wait_data *ad = sk->sk_user_data;
715306f6 688 void (*state_change)(struct sock *sk);
7a426fd8 689
715306f6
AG
690 state_change = ad->original_sk_state_change;
691 if (sk->sk_state == TCP_ESTABLISHED)
692 complete(&ad->door_bell);
693 state_change(sk);
7a426fd8
PR
694}
695
bde89a9e 696static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
b411b363 697{
1f3e509b 698 int err, sndbuf_size, rcvbuf_size, my_addr_len;
44ed167d 699 struct sockaddr_in6 my_addr;
1f3e509b 700 struct socket *s_listen;
44ed167d 701 struct net_conf *nc;
b411b363
PR
702 const char *what;
703
44ed167d 704 rcu_read_lock();
bde89a9e 705 nc = rcu_dereference(connection->net_conf);
44ed167d
PR
706 if (!nc) {
707 rcu_read_unlock();
7a426fd8 708 return -EIO;
44ed167d 709 }
44ed167d
PR
710 sndbuf_size = nc->sndbuf_size;
711 rcvbuf_size = nc->rcvbuf_size;
44ed167d 712 rcu_read_unlock();
b411b363 713
bde89a9e
AG
714 my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
715 memcpy(&my_addr, &connection->my_addr, my_addr_len);
b411b363
PR
716
717 what = "sock_create_kern";
eeb1bd5c 718 err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
1f3e509b 719 SOCK_STREAM, IPPROTO_TCP, &s_listen);
b411b363
PR
720 if (err) {
721 s_listen = NULL;
722 goto out;
723 }
724
98683650 725 s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
44ed167d 726 drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
b411b363
PR
727
728 what = "bind before listen";
44ed167d 729 err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
b411b363
PR
730 if (err < 0)
731 goto out;
732
7a426fd8
PR
733 ad->s_listen = s_listen;
734 write_lock_bh(&s_listen->sk->sk_callback_lock);
735 ad->original_sk_state_change = s_listen->sk->sk_state_change;
715306f6 736 s_listen->sk->sk_state_change = drbd_incoming_connection;
7a426fd8
PR
737 s_listen->sk->sk_user_data = ad;
738 write_unlock_bh(&s_listen->sk->sk_callback_lock);
b411b363 739
2820fd39
PR
740 what = "listen";
741 err = s_listen->ops->listen(s_listen, 5);
742 if (err < 0)
743 goto out;
744
7a426fd8 745 return 0;
b411b363
PR
746out:
747 if (s_listen)
748 sock_release(s_listen);
749 if (err < 0) {
750 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
1ec861eb 751 drbd_err(connection, "%s failed, err = %d\n", what, err);
bde89a9e 752 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
753 }
754 }
b411b363 755
7a426fd8 756 return -EIO;
b411b363
PR
757}
758
715306f6 759static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
b411b363 760{
715306f6
AG
761 write_lock_bh(&sk->sk_callback_lock);
762 sk->sk_state_change = ad->original_sk_state_change;
763 sk->sk_user_data = NULL;
764 write_unlock_bh(&sk->sk_callback_lock);
b411b363
PR
765}
766
bde89a9e 767static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
b411b363 768{
1f3e509b
PR
769 int timeo, connect_int, err = 0;
770 struct socket *s_estab = NULL;
1f3e509b
PR
771 struct net_conf *nc;
772
773 rcu_read_lock();
bde89a9e 774 nc = rcu_dereference(connection->net_conf);
1f3e509b
PR
775 if (!nc) {
776 rcu_read_unlock();
777 return NULL;
778 }
779 connect_int = nc->connect_int;
780 rcu_read_unlock();
781
782 timeo = connect_int * HZ;
38b682b2 783 /* 28.5% random jitter */
8032bf12 784 timeo += get_random_u32_below(2) ? timeo / 7 : -timeo / 7;
1f3e509b 785
7a426fd8
PR
786 err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
787 if (err <= 0)
788 return NULL;
b411b363 789
7a426fd8 790 err = kernel_accept(ad->s_listen, &s_estab, 0);
b411b363
PR
791 if (err < 0) {
792 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
1ec861eb 793 drbd_err(connection, "accept failed, err = %d\n", err);
bde89a9e 794 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
795 }
796 }
b411b363 797
715306f6
AG
798 if (s_estab)
799 unregister_state_change(s_estab->sk, ad);
b411b363 800
b411b363
PR
801 return s_estab;
802}
b411b363 803
bde89a9e 804static int decode_header(struct drbd_connection *, void *, struct packet_info *);
b411b363 805
bde89a9e 806static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
9f5bdc33
AG
807 enum drbd_packet cmd)
808{
bde89a9e 809 if (!conn_prepare_command(connection, sock))
9f5bdc33 810 return -EIO;
bde89a9e 811 return conn_send_command(connection, sock, cmd, 0, NULL, 0);
b411b363
PR
812}
813
bde89a9e 814static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
b411b363 815{
bde89a9e 816 unsigned int header_size = drbd_header_size(connection);
9f5bdc33 817 struct packet_info pi;
4920e37a 818 struct net_conf *nc;
9f5bdc33 819 int err;
b411b363 820
4920e37a
PR
821 rcu_read_lock();
822 nc = rcu_dereference(connection->net_conf);
823 if (!nc) {
824 rcu_read_unlock();
825 return -EIO;
826 }
827 sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
828 rcu_read_unlock();
829
bde89a9e 830 err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
9f5bdc33
AG
831 if (err != header_size) {
832 if (err >= 0)
833 err = -EIO;
834 return err;
835 }
bde89a9e 836 err = decode_header(connection, connection->data.rbuf, &pi);
9f5bdc33
AG
837 if (err)
838 return err;
839 return pi.cmd;
b411b363
PR
840}
841
842/**
843 * drbd_socket_okay() - Free the socket if its connection is not okay
b411b363
PR
844 * @sock: pointer to the pointer to the socket.
845 */
5d0b17f1 846static bool drbd_socket_okay(struct socket **sock)
b411b363
PR
847{
848 int rr;
849 char tb[4];
850
851 if (!*sock)
81e84650 852 return false;
b411b363 853
dbd9eea0 854 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
b411b363
PR
855
856 if (rr > 0 || rr == -EAGAIN) {
81e84650 857 return true;
b411b363
PR
858 } else {
859 sock_release(*sock);
860 *sock = NULL;
81e84650 861 return false;
b411b363
PR
862 }
863}
5d0b17f1
PR
864
865static bool connection_established(struct drbd_connection *connection,
866 struct socket **sock1,
867 struct socket **sock2)
868{
869 struct net_conf *nc;
870 int timeout;
871 bool ok;
872
873 if (!*sock1 || !*sock2)
874 return false;
875
876 rcu_read_lock();
877 nc = rcu_dereference(connection->net_conf);
878 timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
879 rcu_read_unlock();
880 schedule_timeout_interruptible(timeout);
881
882 ok = drbd_socket_okay(sock1);
883 ok = drbd_socket_okay(sock2) && ok;
884
885 return ok;
886}
887
2325eb66
PR
888/* Gets called if a connection is established, or if a new minor gets created
889 in a connection */
69a22773 890int drbd_connected(struct drbd_peer_device *peer_device)
907599e0 891{
69a22773 892 struct drbd_device *device = peer_device->device;
0829f5ed 893 int err;
907599e0 894
b30ab791
AG
895 atomic_set(&device->packet_seq, 0);
896 device->peer_seq = 0;
907599e0 897
69a22773
AG
898 device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
899 &peer_device->connection->cstate_mutex :
b30ab791 900 &device->own_state_mutex;
8410da8f 901
69a22773 902 err = drbd_send_sync_param(peer_device);
0829f5ed 903 if (!err)
69a22773 904 err = drbd_send_sizes(peer_device, 0, 0);
0829f5ed 905 if (!err)
69a22773 906 err = drbd_send_uuids(peer_device);
0829f5ed 907 if (!err)
69a22773 908 err = drbd_send_current_state(peer_device);
b30ab791
AG
909 clear_bit(USE_DEGR_WFC_T, &device->flags);
910 clear_bit(RESIZE_PENDING, &device->flags);
911 atomic_set(&device->ap_in_flight, 0);
912 mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
0829f5ed 913 return err;
907599e0 914}
b411b363
PR
915
916/*
917 * return values:
918 * 1 yes, we have a valid connection
919 * 0 oops, did not work out, please try again
920 * -1 peer talks different language,
921 * no point in trying again, please go standalone.
922 * -2 We do not have a network config...
923 */
bde89a9e 924static int conn_connect(struct drbd_connection *connection)
b411b363 925{
7da35862 926 struct drbd_socket sock, msock;
c06ece6b 927 struct drbd_peer_device *peer_device;
44ed167d 928 struct net_conf *nc;
5d0b17f1
PR
929 int vnr, timeout, h;
930 bool discard_my_data, ok;
197296ff 931 enum drbd_state_rv rv;
7a426fd8 932 struct accept_wait_data ad = {
bde89a9e 933 .connection = connection,
7a426fd8
PR
934 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
935 };
b411b363 936
bde89a9e
AG
937 clear_bit(DISCONNECT_SENT, &connection->flags);
938 if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
b411b363
PR
939 return -2;
940
7da35862 941 mutex_init(&sock.mutex);
bde89a9e
AG
942 sock.sbuf = connection->data.sbuf;
943 sock.rbuf = connection->data.rbuf;
7da35862
PR
944 sock.socket = NULL;
945 mutex_init(&msock.mutex);
bde89a9e
AG
946 msock.sbuf = connection->meta.sbuf;
947 msock.rbuf = connection->meta.rbuf;
7da35862
PR
948 msock.socket = NULL;
949
0916e0e3 950 /* Assume that the peer only understands protocol 80 until we know better. */
bde89a9e 951 connection->agreed_pro_version = 80;
b411b363 952
bde89a9e 953 if (prepare_listen_socket(connection, &ad))
7a426fd8 954 return 0;
b411b363
PR
955
956 do {
2bf89621 957 struct socket *s;
b411b363 958
bde89a9e 959 s = drbd_try_connect(connection);
b411b363 960 if (s) {
7da35862
PR
961 if (!sock.socket) {
962 sock.socket = s;
bde89a9e 963 send_first_packet(connection, &sock, P_INITIAL_DATA);
7da35862 964 } else if (!msock.socket) {
bde89a9e 965 clear_bit(RESOLVE_CONFLICTS, &connection->flags);
7da35862 966 msock.socket = s;
bde89a9e 967 send_first_packet(connection, &msock, P_INITIAL_META);
b411b363 968 } else {
1ec861eb 969 drbd_err(connection, "Logic error in conn_connect()\n");
b411b363
PR
970 goto out_release_sockets;
971 }
972 }
973
5d0b17f1
PR
974 if (connection_established(connection, &sock.socket, &msock.socket))
975 break;
b411b363
PR
976
977retry:
bde89a9e 978 s = drbd_wait_for_connect(connection, &ad);
b411b363 979 if (s) {
bde89a9e 980 int fp = receive_first_packet(connection, s);
7da35862
PR
981 drbd_socket_okay(&sock.socket);
982 drbd_socket_okay(&msock.socket);
92f14951 983 switch (fp) {
e5d6f33a 984 case P_INITIAL_DATA:
7da35862 985 if (sock.socket) {
1ec861eb 986 drbd_warn(connection, "initial packet S crossed\n");
7da35862 987 sock_release(sock.socket);
80c6eed4
PR
988 sock.socket = s;
989 goto randomize;
b411b363 990 }
7da35862 991 sock.socket = s;
b411b363 992 break;
e5d6f33a 993 case P_INITIAL_META:
bde89a9e 994 set_bit(RESOLVE_CONFLICTS, &connection->flags);
7da35862 995 if (msock.socket) {
1ec861eb 996 drbd_warn(connection, "initial packet M crossed\n");
7da35862 997 sock_release(msock.socket);
80c6eed4
PR
998 msock.socket = s;
999 goto randomize;
b411b363 1000 }
7da35862 1001 msock.socket = s;
b411b363
PR
1002 break;
1003 default:
1ec861eb 1004 drbd_warn(connection, "Error receiving initial packet\n");
b411b363 1005 sock_release(s);
80c6eed4 1006randomize:
8032bf12 1007 if (get_random_u32_below(2))
b411b363
PR
1008 goto retry;
1009 }
1010 }
1011
bde89a9e 1012 if (connection->cstate <= C_DISCONNECTING)
b411b363
PR
1013 goto out_release_sockets;
1014 if (signal_pending(current)) {
1015 flush_signals(current);
1016 smp_rmb();
bde89a9e 1017 if (get_t_state(&connection->receiver) == EXITING)
b411b363
PR
1018 goto out_release_sockets;
1019 }
1020
5d0b17f1 1021 ok = connection_established(connection, &sock.socket, &msock.socket);
b666dbf8 1022 } while (!ok);
b411b363 1023
7a426fd8
PR
1024 if (ad.s_listen)
1025 sock_release(ad.s_listen);
b411b363 1026
98683650
PR
1027 sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1028 msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
b411b363 1029
7da35862
PR
1030 sock.socket->sk->sk_allocation = GFP_NOIO;
1031 msock.socket->sk->sk_allocation = GFP_NOIO;
b411b363 1032
98123866
BC
1033 sock.socket->sk->sk_use_task_frag = false;
1034 msock.socket->sk->sk_use_task_frag = false;
1035
7da35862
PR
1036 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1037 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
b411b363 1038
b411b363 1039 /* NOT YET ...
bde89a9e 1040 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
7da35862 1041 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
6038178e 1042 * first set it to the P_CONNECTION_FEATURES timeout,
b411b363 1043 * which we set to 4x the configured ping_timeout. */
44ed167d 1044 rcu_read_lock();
bde89a9e 1045 nc = rcu_dereference(connection->net_conf);
44ed167d 1046
7da35862
PR
1047 sock.socket->sk->sk_sndtimeo =
1048 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
b411b363 1049
7da35862 1050 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
44ed167d 1051 timeout = nc->timeout * HZ / 10;
08b165ba 1052 discard_my_data = nc->discard_my_data;
44ed167d 1053 rcu_read_unlock();
b411b363 1054
7da35862 1055 msock.socket->sk->sk_sndtimeo = timeout;
b411b363
PR
1056
1057 /* we don't want delays.
25985edc 1058 * we use TCP_CORK where appropriate, though */
12abc5ee
CH
1059 tcp_sock_set_nodelay(sock.socket->sk);
1060 tcp_sock_set_nodelay(msock.socket->sk);
b411b363 1061
bde89a9e
AG
1062 connection->data.socket = sock.socket;
1063 connection->meta.socket = msock.socket;
1064 connection->last_received = jiffies;
b411b363 1065
bde89a9e 1066 h = drbd_do_features(connection);
b411b363
PR
1067 if (h <= 0)
1068 return h;
1069
bde89a9e 1070 if (connection->cram_hmac_tfm) {
b30ab791 1071 /* drbd_request_state(device, NS(conn, WFAuth)); */
bde89a9e 1072 switch (drbd_do_auth(connection)) {
b10d96cb 1073 case -1:
1ec861eb 1074 drbd_err(connection, "Authentication of peer failed\n");
b411b363 1075 return -1;
b10d96cb 1076 case 0:
1ec861eb 1077 drbd_err(connection, "Authentication of peer failed, trying again.\n");
b10d96cb 1078 return 0;
b411b363
PR
1079 }
1080 }
1081
bde89a9e
AG
1082 connection->data.socket->sk->sk_sndtimeo = timeout;
1083 connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
b411b363 1084
bde89a9e 1085 if (drbd_send_protocol(connection) == -EOPNOTSUPP)
7e2455c1 1086 return -1;
b411b363 1087
31007745
PR
1088 /* Prevent a race between resync-handshake and
1089 * being promoted to Primary.
1090 *
1091 * Grab and release the state mutex, so we know that any current
1092 * drbd_set_role() is finished, and any incoming drbd_set_role
1093 * will see the STATE_SENT flag, and wait for it to be cleared.
1094 */
1095 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1096 mutex_lock(peer_device->device->state_mutex);
1097
cde81d99
LE
1098 /* avoid a race with conn_request_state( C_DISCONNECTING ) */
1099 spin_lock_irq(&connection->resource->req_lock);
bde89a9e 1100 set_bit(STATE_SENT, &connection->flags);
cde81d99 1101 spin_unlock_irq(&connection->resource->req_lock);
a1096a6e 1102
31007745
PR
1103 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1104 mutex_unlock(peer_device->device->state_mutex);
1105
c141ebda 1106 rcu_read_lock();
c06ece6b
AG
1107 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1108 struct drbd_device *device = peer_device->device;
b30ab791 1109 kref_get(&device->kref);
26ea8f92
AG
1110 rcu_read_unlock();
1111
08b165ba 1112 if (discard_my_data)
b30ab791 1113 set_bit(DISCARD_MY_DATA, &device->flags);
08b165ba 1114 else
b30ab791 1115 clear_bit(DISCARD_MY_DATA, &device->flags);
08b165ba 1116
69a22773 1117 drbd_connected(peer_device);
05a10ec7 1118 kref_put(&device->kref, drbd_destroy_device);
c141ebda
PR
1119 rcu_read_lock();
1120 }
1121 rcu_read_unlock();
1122
bde89a9e
AG
1123 rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1124 if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
1125 clear_bit(STATE_SENT, &connection->flags);
1e86ac48 1126 return 0;
a1096a6e 1127 }
1e86ac48 1128
1c03e520 1129 drbd_thread_start(&connection->ack_receiver);
39e91a60
LE
1130 /* opencoded create_singlethread_workqueue(),
1131 * to be able to use format string arguments */
1132 connection->ack_sender =
1133 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
668700b4
PR
1134 if (!connection->ack_sender) {
1135 drbd_err(connection, "Failed to create workqueue ack_sender\n");
1136 return 0;
1137 }
b411b363 1138
0500813f 1139 mutex_lock(&connection->resource->conf_update);
08b165ba
PR
1140 /* The discard_my_data flag is a single-shot modifier to the next
1141 * connection attempt, the handshake of which is now well underway.
1142 * No need for rcu style copying of the whole struct
1143 * just to clear a single value. */
bde89a9e 1144 connection->net_conf->discard_my_data = 0;
0500813f 1145 mutex_unlock(&connection->resource->conf_update);
08b165ba 1146
d3fcb490 1147 return h;
b411b363
PR
1148
1149out_release_sockets:
7a426fd8
PR
1150 if (ad.s_listen)
1151 sock_release(ad.s_listen);
7da35862
PR
1152 if (sock.socket)
1153 sock_release(sock.socket);
1154 if (msock.socket)
1155 sock_release(msock.socket);
b411b363
PR
1156 return -1;
1157}
1158
bde89a9e 1159static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
b411b363 1160{
bde89a9e 1161 unsigned int header_size = drbd_header_size(connection);
e658983a 1162
0c8e36d9
AG
1163 if (header_size == sizeof(struct p_header100) &&
1164 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1165 struct p_header100 *h = header;
1166 if (h->pad != 0) {
1ec861eb 1167 drbd_err(connection, "Header padding is not zero\n");
0c8e36d9
AG
1168 return -EINVAL;
1169 }
1170 pi->vnr = be16_to_cpu(h->volume);
1171 pi->cmd = be16_to_cpu(h->command);
1172 pi->size = be32_to_cpu(h->length);
1173 } else if (header_size == sizeof(struct p_header95) &&
1174 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
e658983a 1175 struct p_header95 *h = header;
e658983a 1176 pi->cmd = be16_to_cpu(h->command);
b55d84ba
AG
1177 pi->size = be32_to_cpu(h->length);
1178 pi->vnr = 0;
e658983a
AG
1179 } else if (header_size == sizeof(struct p_header80) &&
1180 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1181 struct p_header80 *h = header;
1182 pi->cmd = be16_to_cpu(h->command);
1183 pi->size = be16_to_cpu(h->length);
77351055 1184 pi->vnr = 0;
02918be2 1185 } else {
1ec861eb 1186 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
e658983a 1187 be32_to_cpu(*(__be32 *)header),
bde89a9e 1188 connection->agreed_pro_version);
8172f3e9 1189 return -EINVAL;
b411b363 1190 }
e658983a 1191 pi->data = header + header_size;
8172f3e9 1192 return 0;
257d0af6 1193}
b411b363 1194
c51a0ef3
LE
1195static void drbd_unplug_all_devices(struct drbd_connection *connection)
1196{
1197 if (current->plug == &connection->receiver_plug) {
1198 blk_finish_plug(&connection->receiver_plug);
1199 blk_start_plug(&connection->receiver_plug);
1200 } /* else: maybe just schedule() ?? */
1201}
1202
bde89a9e 1203static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
257d0af6 1204{
bde89a9e 1205 void *buffer = connection->data.rbuf;
69bc7bc3 1206 int err;
257d0af6 1207
bde89a9e 1208 err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
a5c31904 1209 if (err)
69bc7bc3 1210 return err;
257d0af6 1211
bde89a9e
AG
1212 err = decode_header(connection, buffer, pi);
1213 connection->last_received = jiffies;
b411b363 1214
69bc7bc3 1215 return err;
b411b363
PR
1216}
1217
c51a0ef3
LE
1218static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
1219{
1220 void *buffer = connection->data.rbuf;
1221 unsigned int size = drbd_header_size(connection);
1222 int err;
1223
1224 err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
1225 if (err != size) {
1226 /* If we have nothing in the receive buffer now, to reduce
1227 * application latency, try to drain the backend queues as
1228 * quickly as possible, and let remote TCP know what we have
1229 * received so far. */
1230 if (err == -EAGAIN) {
ddd061b8 1231 tcp_sock_set_quickack(connection->data.socket->sk, 2);
c51a0ef3
LE
1232 drbd_unplug_all_devices(connection);
1233 }
1234 if (err > 0) {
1235 buffer += err;
1236 size -= err;
1237 }
1238 err = drbd_recv_all_warn(connection, buffer, size);
1239 if (err)
1240 return err;
1241 }
1242
1243 err = decode_header(connection, connection->data.rbuf, pi);
1244 connection->last_received = jiffies;
1245
1246 return err;
1247}
f9ff0da5
LE
1248/* This is blkdev_issue_flush, but asynchronous.
1249 * We want to submit to all component volumes in parallel,
1250 * then wait for all completions.
1251 */
1252struct issue_flush_context {
1253 atomic_t pending;
1254 int error;
1255 struct completion done;
1256};
1257struct one_flush_context {
1258 struct drbd_device *device;
1259 struct issue_flush_context *ctx;
1260};
1261
1ffa7bfa 1262static void one_flush_endio(struct bio *bio)
b411b363 1263{
f9ff0da5
LE
1264 struct one_flush_context *octx = bio->bi_private;
1265 struct drbd_device *device = octx->device;
1266 struct issue_flush_context *ctx = octx->ctx;
1267
4e4cbee9
CH
1268 if (bio->bi_status) {
1269 ctx->error = blk_status_to_errno(bio->bi_status);
1270 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
f9ff0da5
LE
1271 }
1272 kfree(octx);
1273 bio_put(bio);
1274
1275 clear_bit(FLUSH_PENDING, &device->flags);
1276 put_ldev(device);
1277 kref_put(&device->kref, drbd_destroy_device);
1278
1279 if (atomic_dec_and_test(&ctx->pending))
1280 complete(&ctx->done);
1281}
1282
1283static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1284{
07888c66
CH
1285 struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
1286 REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
f9ff0da5 1287 struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
4b1dc86d
CH
1288
1289 if (!octx) {
1290 drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n");
f9ff0da5
LE
1291 /* FIXME: what else can I do now? disconnecting or detaching
1292 * really does not help to improve the state of the world, either.
1293 */
4b1dc86d 1294 bio_put(bio);
f9ff0da5
LE
1295
1296 ctx->error = -ENOMEM;
1297 put_ldev(device);
1298 kref_put(&device->kref, drbd_destroy_device);
1299 return;
1300 }
4b0007c0 1301
f9ff0da5
LE
1302 octx->device = device;
1303 octx->ctx = ctx;
f9ff0da5
LE
1304 bio->bi_private = octx;
1305 bio->bi_end_io = one_flush_endio;
f9ff0da5
LE
1306
1307 device->flush_jif = jiffies;
1308 set_bit(FLUSH_PENDING, &device->flags);
1309 atomic_inc(&ctx->pending);
1310 submit_bio(bio);
1311}
1312
1313static void drbd_flush(struct drbd_connection *connection)
1314{
f6ba8636 1315 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
f9ff0da5
LE
1316 struct drbd_peer_device *peer_device;
1317 struct issue_flush_context ctx;
1318 int vnr;
1319
1320 atomic_set(&ctx.pending, 1);
1321 ctx.error = 0;
1322 init_completion(&ctx.done);
1323
615e087f 1324 rcu_read_lock();
c06ece6b
AG
1325 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1326 struct drbd_device *device = peer_device->device;
1327
b30ab791 1328 if (!get_ldev(device))
615e087f 1329 continue;
b30ab791 1330 kref_get(&device->kref);
615e087f
LE
1331 rcu_read_unlock();
1332
f9ff0da5 1333 submit_one_flush(device, &ctx);
b411b363 1334
615e087f 1335 rcu_read_lock();
b411b363 1336 }
615e087f 1337 rcu_read_unlock();
f9ff0da5
LE
1338
1339 /* Do we want to add a timeout,
1340 * if disk-timeout is set? */
1341 if (!atomic_dec_and_test(&ctx.pending))
1342 wait_for_completion(&ctx.done);
1343
1344 if (ctx.error) {
1345 /* would rather check on EOPNOTSUPP, but that is not reliable.
1346 * don't try again for ANY return value != 0
1347 * if (rv == -EOPNOTSUPP) */
1348 /* Any error is already reported by bio_endio callback. */
1349 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1350 }
b411b363 1351 }
b411b363
PR
1352}
1353
1354/**
1355 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
9b48ff07 1356 * @connection: DRBD connection.
b411b363
PR
1357 * @epoch: Epoch object.
1358 * @ev: Epoch event.
1359 */
bde89a9e 1360static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
b411b363
PR
1361 struct drbd_epoch *epoch,
1362 enum epoch_event ev)
1363{
2451fc3b 1364 int epoch_size;
b411b363 1365 struct drbd_epoch *next_epoch;
b411b363
PR
1366 enum finish_epoch rv = FE_STILL_LIVE;
1367
bde89a9e 1368 spin_lock(&connection->epoch_lock);
b411b363
PR
1369 do {
1370 next_epoch = NULL;
b411b363
PR
1371
1372 epoch_size = atomic_read(&epoch->epoch_size);
1373
1374 switch (ev & ~EV_CLEANUP) {
1375 case EV_PUT:
1376 atomic_dec(&epoch->active);
1377 break;
1378 case EV_GOT_BARRIER_NR:
1379 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
b411b363
PR
1380 break;
1381 case EV_BECAME_LAST:
1382 /* nothing to do*/
1383 break;
1384 }
1385
b411b363
PR
1386 if (epoch_size != 0 &&
1387 atomic_read(&epoch->active) == 0 &&
80f9fd55 1388 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
b411b363 1389 if (!(ev & EV_CLEANUP)) {
bde89a9e
AG
1390 spin_unlock(&connection->epoch_lock);
1391 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1392 spin_lock(&connection->epoch_lock);
b411b363 1393 }
9ed57dcb
LE
1394#if 0
1395 /* FIXME: dec unacked on connection, once we have
1396 * something to count pending connection packets in. */
80f9fd55 1397 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
bde89a9e 1398 dec_unacked(epoch->connection);
9ed57dcb 1399#endif
b411b363 1400
bde89a9e 1401 if (connection->current_epoch != epoch) {
b411b363
PR
1402 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1403 list_del(&epoch->list);
1404 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
bde89a9e 1405 connection->epochs--;
b411b363
PR
1406 kfree(epoch);
1407
1408 if (rv == FE_STILL_LIVE)
1409 rv = FE_DESTROYED;
1410 } else {
1411 epoch->flags = 0;
1412 atomic_set(&epoch->epoch_size, 0);
698f9315 1413 /* atomic_set(&epoch->active, 0); is already zero */
b411b363
PR
1414 if (rv == FE_STILL_LIVE)
1415 rv = FE_RECYCLED;
1416 }
1417 }
1418
1419 if (!next_epoch)
1420 break;
1421
1422 epoch = next_epoch;
1423 } while (1);
1424
bde89a9e 1425 spin_unlock(&connection->epoch_lock);
b411b363 1426
b411b363
PR
1427 return rv;
1428}
1429
8fe39aac
PR
1430static enum write_ordering_e
1431max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1432{
1433 struct disk_conf *dc;
1434
1435 dc = rcu_dereference(bdev->disk_conf);
1436
f6ba8636
AG
1437 if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1438 wo = WO_DRAIN_IO;
1439 if (wo == WO_DRAIN_IO && !dc->disk_drain)
1440 wo = WO_NONE;
8fe39aac
PR
1441
1442 return wo;
1443}
1444
9b48ff07 1445/*
b411b363 1446 * drbd_bump_write_ordering() - Fall back to an other write ordering method
b411b363
PR
1447 * @wo: Write ordering method to try.
1448 */
8fe39aac
PR
1449void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1450 enum write_ordering_e wo)
b411b363 1451{
e9526580 1452 struct drbd_device *device;
b411b363 1453 enum write_ordering_e pwo;
4b0007c0 1454 int vnr;
b411b363 1455 static char *write_ordering_str[] = {
f6ba8636
AG
1456 [WO_NONE] = "none",
1457 [WO_DRAIN_IO] = "drain",
1458 [WO_BDEV_FLUSH] = "flush",
b411b363
PR
1459 };
1460
e9526580 1461 pwo = resource->write_ordering;
f6ba8636 1462 if (wo != WO_BDEV_FLUSH)
70df7092 1463 wo = min(pwo, wo);
daeda1cc 1464 rcu_read_lock();
e9526580 1465 idr_for_each_entry(&resource->devices, device, vnr) {
8fe39aac
PR
1466 if (get_ldev(device)) {
1467 wo = max_allowed_wo(device->ldev, wo);
1468 if (device->ldev == bdev)
1469 bdev = NULL;
1470 put_ldev(device);
1471 }
4b0007c0 1472 }
8fe39aac
PR
1473
1474 if (bdev)
1475 wo = max_allowed_wo(bdev, wo);
1476
70df7092
LE
1477 rcu_read_unlock();
1478
e9526580 1479 resource->write_ordering = wo;
f6ba8636 1480 if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
e9526580 1481 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
b411b363
PR
1482}
1483
f31e583a
LE
1484/*
1485 * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
1486 * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
1487 * will directly go to fallback mode, submitting normal writes, and
1488 * never even try to UNMAP.
1489 *
1490 * And dm-thin does not do this (yet), mostly because in general it has
1491 * to assume that "skip_block_zeroing" is set. See also:
1492 * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
1493 * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
1494 *
1495 * We *may* ignore the discard-zeroes-data setting, if so configured.
1496 *
1497 * Assumption is that this "discard_zeroes_data=0" is only because the backend
1498 * may ignore partial unaligned discards.
1499 *
1500 * LVM/DM thin as of at least
1501 * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
1502 * Library version: 1.02.93-RHEL7 (2015-01-28)
1503 * Driver version: 4.29.0
1504 * still behaves this way.
1505 *
1506 * For unaligned (wrt. alignment and granularity) or too small discards,
1507 * we zero-out the initial (and/or) trailing unaligned partial chunks,
1508 * but discard all the aligned full chunks.
1509 *
1510 * At least for LVM/DM thin, with skip_block_zeroing=false,
1511 * the result is effectively "discard_zeroes_data=1".
1512 */
1513/* flags: EE_TRIM|EE_ZEROOUT */
1514int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
dd4f699d
LE
1515{
1516 struct block_device *bdev = device->ldev->backing_bdev;
f31e583a
LE
1517 sector_t tmp, nr;
1518 unsigned int max_discard_sectors, granularity;
1519 int alignment;
1520 int err = 0;
dd4f699d 1521
f31e583a
LE
1522 if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
1523 goto zero_out;
1524
1525 /* Zero-sector (unknown) and one-sector granularities are the same. */
7b47ef52 1526 granularity = max(bdev_discard_granularity(bdev) >> 9, 1U);
f31e583a
LE
1527 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1528
cf0fbf89 1529 max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22));
f31e583a
LE
1530 max_discard_sectors -= max_discard_sectors % granularity;
1531 if (unlikely(!max_discard_sectors))
1532 goto zero_out;
1533
1534 if (nr_sectors < granularity)
1535 goto zero_out;
1536
1537 tmp = start;
1538 if (sector_div(tmp, granularity) != alignment) {
1539 if (nr_sectors < 2*granularity)
1540 goto zero_out;
1541 /* start + gran - (start + gran - align) % gran */
1542 tmp = start + granularity - alignment;
1543 tmp = start + granularity - sector_div(tmp, granularity);
1544
1545 nr = tmp - start;
1546 /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
1547 * layers are below us, some may have smaller granularity */
1548 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
1549 nr_sectors -= nr;
1550 start = tmp;
1551 }
1552 while (nr_sectors >= max_discard_sectors) {
44abff2c
CH
1553 err |= blkdev_issue_discard(bdev, start, max_discard_sectors,
1554 GFP_NOIO);
f31e583a
LE
1555 nr_sectors -= max_discard_sectors;
1556 start += max_discard_sectors;
1557 }
1558 if (nr_sectors) {
1559 /* max_discard_sectors is unsigned int (and a multiple of
1560 * granularity, we made sure of that above already);
1561 * nr is < max_discard_sectors;
1562 * I don't need sector_div here, even though nr is sector_t */
1563 nr = nr_sectors;
1564 nr -= (unsigned int)nr % granularity;
1565 if (nr) {
44abff2c 1566 err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO);
f31e583a
LE
1567 nr_sectors -= nr;
1568 start += nr;
1569 }
1570 }
1571 zero_out:
1572 if (nr_sectors) {
1573 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
1574 (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
1575 }
1576 return err != 0;
1577}
0dbed96a 1578
f31e583a
LE
1579static bool can_do_reliable_discards(struct drbd_device *device)
1580{
f31e583a
LE
1581 struct disk_conf *dc;
1582 bool can_do;
0dbed96a 1583
70200574 1584 if (!bdev_max_discard_sectors(device->ldev->backing_bdev))
f31e583a
LE
1585 return false;
1586
1587 rcu_read_lock();
1588 dc = rcu_dereference(device->ldev->disk_conf);
1589 can_do = dc->discard_zeroes_if_aligned;
1590 rcu_read_unlock();
1591 return can_do;
1592}
1593
1594static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
1595{
1596 /* If the backend cannot discard, or does not guarantee
1597 * read-back zeroes in discarded ranges, we fall back to
1598 * zero-out. Unless configuration specifically requested
1599 * otherwise. */
1600 if (!can_do_reliable_discards(device))
1601 peer_req->flags |= EE_ZEROOUT;
1602
1603 if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1604 peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
1605 peer_req->flags |= EE_WAS_ERROR;
dd4f699d
LE
1606 drbd_endio_write_sec_final(peer_req);
1607}
1608
ce668b6d
CB
1609static int peer_request_fault_type(struct drbd_peer_request *peer_req)
1610{
1611 if (peer_req_op(peer_req) == REQ_OP_READ) {
1612 return peer_req->flags & EE_APPLICATION ?
1613 DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
1614 } else {
1615 return peer_req->flags & EE_APPLICATION ?
1616 DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
1617 }
1618}
1619
a34592ff 1620/**
fbe29dec 1621 * drbd_submit_peer_request()
db830c46 1622 * @peer_req: peer request
10f6d992
LE
1623 *
1624 * May spread the pages to multiple bios,
1625 * depending on bio_add_page restrictions.
1626 *
1627 * Returns 0 if all bios have been submitted,
1628 * -ENOMEM if we could not allocate enough bios,
1629 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1630 * single page to an empty bio (which should never happen and likely indicates
1631 * that the lower level IO stack is in some way broken). This has been observed
1632 * on certain Xen deployments.
45bb912b
LE
1633 */
1634/* TODO allocate from our own bio_set. */
ce668b6d 1635int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
45bb912b 1636{
ce668b6d 1637 struct drbd_device *device = peer_req->peer_device->device;
45bb912b
LE
1638 struct bio *bios = NULL;
1639 struct bio *bio;
db830c46
AG
1640 struct page *page = peer_req->pages;
1641 sector_t sector = peer_req->i.sector;
e6be38a1
CH
1642 unsigned int data_size = peer_req->i.size;
1643 unsigned int n_bios = 0;
1644 unsigned int nr_pages = PFN_UP(data_size);
45bb912b 1645
dd4f699d
LE
1646 /* TRIM/DISCARD: for now, always use the helper function
1647 * blkdev_issue_zeroout(..., discard=true).
1648 * It's synchronous, but it does the right thing wrt. bio splitting.
1649 * Correctness first, performance later. Next step is to code an
1650 * asynchronous variant of the same.
1651 */
a34592ff 1652 if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) {
a0fb3c47
LE
1653 /* wait for all pending IO completions, before we start
1654 * zeroing things out. */
5dd2ca19 1655 conn_wait_active_ee_empty(peer_req->peer_device->connection);
45d2933c
LE
1656 /* add it to the active list now,
1657 * so we can find it to present it in debugfs */
21ae5d7f
LE
1658 peer_req->submit_jif = jiffies;
1659 peer_req->flags |= EE_SUBMITTED;
700ca8c0
PR
1660
1661 /* If this was a resync request from receive_rs_deallocated(),
1662 * it is already on the sync_ee list */
1663 if (list_empty(&peer_req->w.list)) {
1664 spin_lock_irq(&device->resource->req_lock);
1665 list_add_tail(&peer_req->w.list, &device->active_ee);
1666 spin_unlock_irq(&device->resource->req_lock);
1667 }
1668
a34592ff 1669 drbd_issue_peer_discard_or_zero_out(device, peer_req);
a0fb3c47
LE
1670 return 0;
1671 }
1672
45bb912b
LE
1673 /* In most cases, we will only need one bio. But in case the lower
1674 * level restrictions happen to be different at this offset on this
1675 * side than those of the sending peer, we may need to submit the
9476f39d
LE
1676 * request in more than one bio.
1677 *
1678 * Plain bio_alloc is good enough here, this is no DRBD internally
1679 * generated bio, but a bio allocated on behalf of the peer.
1680 */
45bb912b 1681next_bio:
ce668b6d
CB
1682 /* _DISCARD, _WRITE_ZEROES handled above.
1683 * REQ_OP_FLUSH (empty flush) not expected,
1684 * should have been mapped to a "drbd protocol barrier".
1685 * REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
1686 */
1687 if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
1688 peer_req_op(peer_req) == REQ_OP_READ)) {
1689 drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
1690 return -EINVAL;
1691 }
1692
1693 bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
db830c46 1694 /* > peer_req->i.sector, unless this is the first bio */
4f024f37 1695 bio->bi_iter.bi_sector = sector;
db830c46 1696 bio->bi_private = peer_req;
fcefa62e 1697 bio->bi_end_io = drbd_peer_request_endio;
45bb912b
LE
1698
1699 bio->bi_next = bios;
1700 bios = bio;
1701 ++n_bios;
1702
1703 page_chain_for_each(page) {
11f8b2b6 1704 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
06efffda 1705 if (!bio_add_page(bio, page, len, 0))
45bb912b 1706 goto next_bio;
11f8b2b6 1707 data_size -= len;
45bb912b
LE
1708 sector += len >> 9;
1709 --nr_pages;
1710 }
11f8b2b6 1711 D_ASSERT(device, data_size == 0);
a0fb3c47 1712 D_ASSERT(device, page == NULL);
45bb912b 1713
db830c46 1714 atomic_set(&peer_req->pending_bios, n_bios);
21ae5d7f
LE
1715 /* for debugfs: update timestamp, mark as submitted */
1716 peer_req->submit_jif = jiffies;
1717 peer_req->flags |= EE_SUBMITTED;
45bb912b
LE
1718 do {
1719 bio = bios;
1720 bios = bios->bi_next;
1721 bio->bi_next = NULL;
1722
ce668b6d 1723 drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
45bb912b 1724 } while (bios);
45bb912b 1725 return 0;
45bb912b
LE
1726}
1727
b30ab791 1728static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
db830c46 1729 struct drbd_peer_request *peer_req)
53840641 1730{
db830c46 1731 struct drbd_interval *i = &peer_req->i;
53840641 1732
b30ab791 1733 drbd_remove_interval(&device->write_requests, i);
53840641
AG
1734 drbd_clear_interval(i);
1735
6c852bec 1736 /* Wake up any processes waiting for this peer request to complete. */
53840641 1737 if (i->waiting)
b30ab791 1738 wake_up(&device->misc_wait);
53840641
AG
1739}
1740
bde89a9e 1741static void conn_wait_active_ee_empty(struct drbd_connection *connection)
77fede51 1742{
c06ece6b 1743 struct drbd_peer_device *peer_device;
77fede51
PR
1744 int vnr;
1745
1746 rcu_read_lock();
c06ece6b
AG
1747 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1748 struct drbd_device *device = peer_device->device;
1749
b30ab791 1750 kref_get(&device->kref);
77fede51 1751 rcu_read_unlock();
b30ab791 1752 drbd_wait_ee_list_empty(device, &device->active_ee);
05a10ec7 1753 kref_put(&device->kref, drbd_destroy_device);
77fede51
PR
1754 rcu_read_lock();
1755 }
1756 rcu_read_unlock();
1757}
1758
bde89a9e 1759static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
b411b363 1760{
2451fc3b 1761 int rv;
e658983a 1762 struct p_barrier *p = pi->data;
b411b363
PR
1763 struct drbd_epoch *epoch;
1764
9ed57dcb
LE
1765 /* FIXME these are unacked on connection,
1766 * not a specific (peer)device.
1767 */
bde89a9e
AG
1768 connection->current_epoch->barrier_nr = p->barrier;
1769 connection->current_epoch->connection = connection;
1770 rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
b411b363
PR
1771
1772 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1773 * the activity log, which means it would not be resynced in case the
1774 * R_PRIMARY crashes now.
1775 * Therefore we must send the barrier_ack after the barrier request was
1776 * completed. */
e9526580 1777 switch (connection->resource->write_ordering) {
f6ba8636 1778 case WO_NONE:
b411b363 1779 if (rv == FE_RECYCLED)
82bc0194 1780 return 0;
2451fc3b
PR
1781
1782 /* receiver context, in the writeout path of the other node.
1783 * avoid potential distributed deadlock */
1784 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1785 if (epoch)
1786 break;
1787 else
1ec861eb 1788 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
df561f66 1789 fallthrough;
b411b363 1790
f6ba8636
AG
1791 case WO_BDEV_FLUSH:
1792 case WO_DRAIN_IO:
bde89a9e
AG
1793 conn_wait_active_ee_empty(connection);
1794 drbd_flush(connection);
2451fc3b 1795
bde89a9e 1796 if (atomic_read(&connection->current_epoch->epoch_size)) {
2451fc3b
PR
1797 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1798 if (epoch)
1799 break;
b411b363
PR
1800 }
1801
82bc0194 1802 return 0;
2451fc3b 1803 default:
e9526580
PR
1804 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1805 connection->resource->write_ordering);
82bc0194 1806 return -EIO;
b411b363
PR
1807 }
1808
1809 epoch->flags = 0;
1810 atomic_set(&epoch->epoch_size, 0);
1811 atomic_set(&epoch->active, 0);
1812
bde89a9e
AG
1813 spin_lock(&connection->epoch_lock);
1814 if (atomic_read(&connection->current_epoch->epoch_size)) {
1815 list_add(&epoch->list, &connection->current_epoch->list);
1816 connection->current_epoch = epoch;
1817 connection->epochs++;
b411b363
PR
1818 } else {
1819 /* The current_epoch got recycled while we allocated this one... */
1820 kfree(epoch);
1821 }
bde89a9e 1822 spin_unlock(&connection->epoch_lock);
b411b363 1823
82bc0194 1824 return 0;
b411b363
PR
1825}
1826
9104d31a 1827/* quick wrapper in case payload size != request_size (write same) */
3d0e6375 1828static void drbd_csum_ee_size(struct crypto_shash *h,
9104d31a
LE
1829 struct drbd_peer_request *r, void *d,
1830 unsigned int payload_size)
1831{
1832 unsigned int tmp = r->i.size;
1833 r->i.size = payload_size;
1834 drbd_csum_ee(h, r, d);
1835 r->i.size = tmp;
1836}
1837
b411b363 1838/* used from receive_RSDataReply (recv_resync_read)
9104d31a
LE
1839 * and from receive_Data.
1840 * data_size: actual payload ("data in")
1841 * for normal writes that is bi_size.
1842 * for discards, that is zero.
1843 * for write same, it is logical_block_size.
1844 * both trim and write same have the bi_size ("data len to be affected")
1845 * as extra argument in the packet header.
1846 */
f6ffca9f 1847static struct drbd_peer_request *
69a22773 1848read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
a0fb3c47 1849 struct packet_info *pi) __must_hold(local)
b411b363 1850{
69a22773 1851 struct drbd_device *device = peer_device->device;
155bd9d1 1852 const sector_t capacity = get_capacity(device->vdisk);
db830c46 1853 struct drbd_peer_request *peer_req;
b411b363 1854 struct page *page;
11f8b2b6
AG
1855 int digest_size, err;
1856 unsigned int data_size = pi->size, ds;
69a22773
AG
1857 void *dig_in = peer_device->connection->int_dig_in;
1858 void *dig_vv = peer_device->connection->int_dig_vv;
6b4388ac 1859 unsigned long *data;
a0fb3c47 1860 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
f31e583a 1861 struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
b411b363 1862
11f8b2b6 1863 digest_size = 0;
a0fb3c47 1864 if (!trim && peer_device->connection->peer_integrity_tfm) {
3d0e6375 1865 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
9f5bdc33
AG
1866 /*
1867 * FIXME: Receive the incoming digest into the receive buffer
1868 * here, together with its struct p_data?
1869 */
11f8b2b6 1870 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
a5c31904 1871 if (err)
b411b363 1872 return NULL;
11f8b2b6 1873 data_size -= digest_size;
b411b363
PR
1874 }
1875
a34592ff 1876 /* assume request_size == data_size, but special case trim. */
9104d31a 1877 ds = data_size;
a0fb3c47 1878 if (trim) {
677b3672 1879 if (!expect(peer_device, data_size == 0))
9104d31a
LE
1880 return NULL;
1881 ds = be32_to_cpu(trim->size);
f31e583a 1882 } else if (zeroes) {
677b3672 1883 if (!expect(peer_device, data_size == 0))
f31e583a
LE
1884 return NULL;
1885 ds = be32_to_cpu(zeroes->size);
a0fb3c47
LE
1886 }
1887
677b3672 1888 if (!expect(peer_device, IS_ALIGNED(ds, 512)))
841ce241 1889 return NULL;
a34592ff 1890 if (trim || zeroes) {
677b3672 1891 if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
9104d31a 1892 return NULL;
677b3672 1893 } else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
841ce241 1894 return NULL;
b411b363 1895
6666032a
LE
1896 /* even though we trust out peer,
1897 * we sometimes have to double check. */
9104d31a 1898 if (sector + (ds>>9) > capacity) {
d0180171 1899 drbd_err(device, "request from peer beyond end of local disk: "
fdda6544 1900 "capacity: %llus < sector: %llus + size: %u\n",
6666032a 1901 (unsigned long long)capacity,
9104d31a 1902 (unsigned long long)sector, ds);
6666032a
LE
1903 return NULL;
1904 }
1905
b411b363
PR
1906 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1907 * "criss-cross" setup, that might cause write-out on some other DRBD,
1908 * which in turn might block on the other node at this very place. */
9104d31a 1909 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
db830c46 1910 if (!peer_req)
b411b363 1911 return NULL;
45bb912b 1912
21ae5d7f 1913 peer_req->flags |= EE_WRITE;
9104d31a 1914 if (trim) {
f31e583a
LE
1915 peer_req->flags |= EE_TRIM;
1916 return peer_req;
1917 }
1918 if (zeroes) {
1919 peer_req->flags |= EE_ZEROOUT;
81a3537a 1920 return peer_req;
9104d31a 1921 }
a73ff323 1922
9104d31a 1923 /* receive payload size bytes into page chain */
b411b363 1924 ds = data_size;
db830c46 1925 page = peer_req->pages;
45bb912b
LE
1926 page_chain_for_each(page) {
1927 unsigned len = min_t(int, ds, PAGE_SIZE);
6b4388ac 1928 data = kmap(page);
69a22773 1929 err = drbd_recv_all_warn(peer_device->connection, data, len);
b30ab791 1930 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
d0180171 1931 drbd_err(device, "Fault injection: Corrupting data on receive\n");
6b4388ac
PR
1932 data[0] = data[0] ^ (unsigned long)-1;
1933 }
b411b363 1934 kunmap(page);
a5c31904 1935 if (err) {
b30ab791 1936 drbd_free_peer_req(device, peer_req);
b411b363
PR
1937 return NULL;
1938 }
a5c31904 1939 ds -= len;
b411b363
PR
1940 }
1941
11f8b2b6 1942 if (digest_size) {
9104d31a 1943 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
11f8b2b6 1944 if (memcmp(dig_in, dig_vv, digest_size)) {
d0180171 1945 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
470be44a 1946 (unsigned long long)sector, data_size);
b30ab791 1947 drbd_free_peer_req(device, peer_req);
b411b363
PR
1948 return NULL;
1949 }
1950 }
11f8b2b6 1951 device->recv_cnt += data_size >> 9;
db830c46 1952 return peer_req;
b411b363
PR
1953}
1954
1955/* drbd_drain_block() just takes a data block
1956 * out of the socket input buffer, and discards it.
1957 */
69a22773 1958static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
b411b363
PR
1959{
1960 struct page *page;
a5c31904 1961 int err = 0;
b411b363
PR
1962 void *data;
1963
c3470cde 1964 if (!data_size)
fc5be839 1965 return 0;
c3470cde 1966
69a22773 1967 page = drbd_alloc_pages(peer_device, 1, 1);
b411b363
PR
1968
1969 data = kmap(page);
1970 while (data_size) {
fc5be839
AG
1971 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1972
69a22773 1973 err = drbd_recv_all_warn(peer_device->connection, data, len);
a5c31904 1974 if (err)
b411b363 1975 break;
a5c31904 1976 data_size -= len;
b411b363
PR
1977 }
1978 kunmap(page);
69a22773 1979 drbd_free_pages(peer_device->device, page, 0);
fc5be839 1980 return err;
b411b363
PR
1981}
1982
69a22773 1983static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
b411b363
PR
1984 sector_t sector, int data_size)
1985{
7988613b
KO
1986 struct bio_vec bvec;
1987 struct bvec_iter iter;
b411b363 1988 struct bio *bio;
11f8b2b6 1989 int digest_size, err, expect;
69a22773
AG
1990 void *dig_in = peer_device->connection->int_dig_in;
1991 void *dig_vv = peer_device->connection->int_dig_vv;
b411b363 1992
11f8b2b6 1993 digest_size = 0;
69a22773 1994 if (peer_device->connection->peer_integrity_tfm) {
3d0e6375 1995 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
11f8b2b6 1996 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
a5c31904
AG
1997 if (err)
1998 return err;
11f8b2b6 1999 data_size -= digest_size;
b411b363
PR
2000 }
2001
b411b363
PR
2002 /* optimistically update recv_cnt. if receiving fails below,
2003 * we disconnect anyways, and counters will be reset. */
69a22773 2004 peer_device->device->recv_cnt += data_size>>9;
b411b363
PR
2005
2006 bio = req->master_bio;
69a22773 2007 D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
b411b363 2008
7988613b 2009 bio_for_each_segment(bvec, bio, iter) {
3eddaa60 2010 void *mapped = bvec_kmap_local(&bvec);
7988613b 2011 expect = min_t(int, data_size, bvec.bv_len);
69a22773 2012 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
3eddaa60 2013 kunmap_local(mapped);
a5c31904
AG
2014 if (err)
2015 return err;
2016 data_size -= expect;
b411b363
PR
2017 }
2018
11f8b2b6 2019 if (digest_size) {
69a22773 2020 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
11f8b2b6 2021 if (memcmp(dig_in, dig_vv, digest_size)) {
69a22773 2022 drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
28284cef 2023 return -EINVAL;
b411b363
PR
2024 }
2025 }
2026
69a22773 2027 D_ASSERT(peer_device->device, data_size == 0);
28284cef 2028 return 0;
b411b363
PR
2029}
2030
a990be46 2031/*
668700b4 2032 * e_end_resync_block() is called in ack_sender context via
a990be46
AG
2033 * drbd_finish_peer_reqs().
2034 */
99920dc5 2035static int e_end_resync_block(struct drbd_work *w, int unused)
b411b363 2036{
8050e6d0 2037 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2038 container_of(w, struct drbd_peer_request, w);
2039 struct drbd_peer_device *peer_device = peer_req->peer_device;
2040 struct drbd_device *device = peer_device->device;
db830c46 2041 sector_t sector = peer_req->i.sector;
99920dc5 2042 int err;
b411b363 2043
0b0ba1ef 2044 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
b411b363 2045
db830c46 2046 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b30ab791 2047 drbd_set_in_sync(device, sector, peer_req->i.size);
a8cd15ba 2048 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
b411b363
PR
2049 } else {
2050 /* Record failure to sync */
b30ab791 2051 drbd_rs_failed_io(device, sector, peer_req->i.size);
b411b363 2052
a8cd15ba 2053 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
b411b363 2054 }
b30ab791 2055 dec_unacked(device);
b411b363 2056
99920dc5 2057 return err;
b411b363
PR
2058}
2059
69a22773 2060static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
a0fb3c47 2061 struct packet_info *pi) __releases(local)
b411b363 2062{
69a22773 2063 struct drbd_device *device = peer_device->device;
db830c46 2064 struct drbd_peer_request *peer_req;
b411b363 2065
a0fb3c47 2066 peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
db830c46 2067 if (!peer_req)
45bb912b 2068 goto fail;
b411b363 2069
b30ab791 2070 dec_rs_pending(device);
b411b363 2071
b30ab791 2072 inc_unacked(device);
b411b363
PR
2073 /* corresponding dec_unacked() in e_end_resync_block()
2074 * respective _drbd_clear_done_ee */
2075
a8cd15ba 2076 peer_req->w.cb = e_end_resync_block;
ce668b6d 2077 peer_req->opf = REQ_OP_WRITE;
21ae5d7f 2078 peer_req->submit_jif = jiffies;
45bb912b 2079
0500813f 2080 spin_lock_irq(&device->resource->req_lock);
b9ed7080 2081 list_add_tail(&peer_req->w.list, &device->sync_ee);
0500813f 2082 spin_unlock_irq(&device->resource->req_lock);
b411b363 2083
a0fb3c47 2084 atomic_add(pi->size >> 9, &device->rs_sect_ev);
ce668b6d 2085 if (drbd_submit_peer_request(peer_req) == 0)
e1c1b0fc 2086 return 0;
b411b363 2087
10f6d992 2088 /* don't care for the reason here */
d0180171 2089 drbd_err(device, "submit failed, triggering re-connect\n");
0500813f 2090 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2091 list_del(&peer_req->w.list);
0500813f 2092 spin_unlock_irq(&device->resource->req_lock);
22cc37a9 2093
b30ab791 2094 drbd_free_peer_req(device, peer_req);
45bb912b 2095fail:
b30ab791 2096 put_ldev(device);
e1c1b0fc 2097 return -EIO;
b411b363
PR
2098}
2099
668eebc6 2100static struct drbd_request *
b30ab791 2101find_request(struct drbd_device *device, struct rb_root *root, u64 id,
bc9c5c41 2102 sector_t sector, bool missing_ok, const char *func)
51624585 2103{
51624585
AG
2104 struct drbd_request *req;
2105
bc9c5c41
AG
2106 /* Request object according to our peer */
2107 req = (struct drbd_request *)(unsigned long)id;
5e472264 2108 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
668eebc6 2109 return req;
c3afd8f5 2110 if (!missing_ok) {
d0180171 2111 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
c3afd8f5
AG
2112 (unsigned long)id, (unsigned long long)sector);
2113 }
51624585 2114 return NULL;
b411b363
PR
2115}
2116
bde89a9e 2117static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2118{
9f4fe9ad 2119 struct drbd_peer_device *peer_device;
b30ab791 2120 struct drbd_device *device;
b411b363
PR
2121 struct drbd_request *req;
2122 sector_t sector;
82bc0194 2123 int err;
e658983a 2124 struct p_data *p = pi->data;
4a76b161 2125
9f4fe9ad
AG
2126 peer_device = conn_peer_device(connection, pi->vnr);
2127 if (!peer_device)
4a76b161 2128 return -EIO;
9f4fe9ad 2129 device = peer_device->device;
b411b363
PR
2130
2131 sector = be64_to_cpu(p->sector);
2132
0500813f 2133 spin_lock_irq(&device->resource->req_lock);
b30ab791 2134 req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
0500813f 2135 spin_unlock_irq(&device->resource->req_lock);
c3afd8f5 2136 if (unlikely(!req))
82bc0194 2137 return -EIO;
b411b363 2138
69a22773 2139 err = recv_dless_read(peer_device, req, sector, pi->size);
82bc0194 2140 if (!err)
8554df1c 2141 req_mod(req, DATA_RECEIVED);
b411b363
PR
2142 /* else: nothing. handled from drbd_disconnect...
2143 * I don't think we may complete this just yet
2144 * in case we are "on-disconnect: freeze" */
2145
82bc0194 2146 return err;
b411b363
PR
2147}
2148
bde89a9e 2149static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2150{
9f4fe9ad 2151 struct drbd_peer_device *peer_device;
b30ab791 2152 struct drbd_device *device;
b411b363 2153 sector_t sector;
82bc0194 2154 int err;
e658983a 2155 struct p_data *p = pi->data;
4a76b161 2156
9f4fe9ad
AG
2157 peer_device = conn_peer_device(connection, pi->vnr);
2158 if (!peer_device)
4a76b161 2159 return -EIO;
9f4fe9ad 2160 device = peer_device->device;
b411b363
PR
2161
2162 sector = be64_to_cpu(p->sector);
0b0ba1ef 2163 D_ASSERT(device, p->block_id == ID_SYNCER);
b411b363 2164
b30ab791 2165 if (get_ldev(device)) {
b411b363
PR
2166 /* data is submitted to disk within recv_resync_read.
2167 * corresponding put_ldev done below on error,
fcefa62e 2168 * or in drbd_peer_request_endio. */
a0fb3c47 2169 err = recv_resync_read(peer_device, sector, pi);
b411b363 2170 } else {
e3fa02d7 2171 if (drbd_ratelimit())
d0180171 2172 drbd_err(device, "Can not write resync data to local disk.\n");
b411b363 2173
69a22773 2174 err = drbd_drain_block(peer_device, pi->size);
b411b363 2175
69a22773 2176 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
b411b363
PR
2177 }
2178
b30ab791 2179 atomic_add(pi->size >> 9, &device->rs_sect_in);
778f271d 2180
82bc0194 2181 return err;
b411b363
PR
2182}
2183
b30ab791 2184static void restart_conflicting_writes(struct drbd_device *device,
7be8da07 2185 sector_t sector, int size)
b411b363 2186{
7be8da07
AG
2187 struct drbd_interval *i;
2188 struct drbd_request *req;
2189
b30ab791 2190 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2191 if (!i->local)
2192 continue;
2193 req = container_of(i, struct drbd_request, i);
2194 if (req->rq_state & RQ_LOCAL_PENDING ||
2195 !(req->rq_state & RQ_POSTPONED))
2196 continue;
2312f0b3
LE
2197 /* as it is RQ_POSTPONED, this will cause it to
2198 * be queued on the retry workqueue. */
d4dabbe2 2199 __req_mod(req, CONFLICT_RESOLVED, NULL);
7be8da07
AG
2200 }
2201}
b411b363 2202
a990be46 2203/*
668700b4 2204 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
b411b363 2205 */
99920dc5 2206static int e_end_block(struct drbd_work *w, int cancel)
b411b363 2207{
8050e6d0 2208 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2209 container_of(w, struct drbd_peer_request, w);
2210 struct drbd_peer_device *peer_device = peer_req->peer_device;
2211 struct drbd_device *device = peer_device->device;
db830c46 2212 sector_t sector = peer_req->i.sector;
99920dc5 2213 int err = 0, pcmd;
b411b363 2214
303d1448 2215 if (peer_req->flags & EE_SEND_WRITE_ACK) {
db830c46 2216 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b30ab791
AG
2217 pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2218 device->state.conn <= C_PAUSED_SYNC_T &&
db830c46 2219 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
b411b363 2220 P_RS_WRITE_ACK : P_WRITE_ACK;
a8cd15ba 2221 err = drbd_send_ack(peer_device, pcmd, peer_req);
b411b363 2222 if (pcmd == P_RS_WRITE_ACK)
b30ab791 2223 drbd_set_in_sync(device, sector, peer_req->i.size);
b411b363 2224 } else {
a8cd15ba 2225 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
b411b363
PR
2226 /* we expect it to be marked out of sync anyways...
2227 * maybe assert this? */
2228 }
b30ab791 2229 dec_unacked(device);
b411b363 2230 }
08d0dabf 2231
b411b363
PR
2232 /* we delete from the conflict detection hash _after_ we sent out the
2233 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
302bdeae 2234 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
0500813f 2235 spin_lock_irq(&device->resource->req_lock);
0b0ba1ef 2236 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
b30ab791 2237 drbd_remove_epoch_entry_interval(device, peer_req);
7be8da07 2238 if (peer_req->flags & EE_RESTART_REQUESTS)
b30ab791 2239 restart_conflicting_writes(device, sector, peer_req->i.size);
0500813f 2240 spin_unlock_irq(&device->resource->req_lock);
bb3bfe96 2241 } else
0b0ba1ef 2242 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
b411b363 2243
5dd2ca19 2244 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
b411b363 2245
99920dc5 2246 return err;
b411b363
PR
2247}
2248
a8cd15ba 2249static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
b411b363 2250{
8050e6d0 2251 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2252 container_of(w, struct drbd_peer_request, w);
2253 struct drbd_peer_device *peer_device = peer_req->peer_device;
99920dc5 2254 int err;
b411b363 2255
a8cd15ba
AG
2256 err = drbd_send_ack(peer_device, ack, peer_req);
2257 dec_unacked(peer_device->device);
b411b363 2258
99920dc5 2259 return err;
b411b363
PR
2260}
2261
d4dabbe2 2262static int e_send_superseded(struct drbd_work *w, int unused)
7be8da07 2263{
a8cd15ba 2264 return e_send_ack(w, P_SUPERSEDED);
7be8da07
AG
2265}
2266
99920dc5 2267static int e_send_retry_write(struct drbd_work *w, int unused)
7be8da07 2268{
a8cd15ba
AG
2269 struct drbd_peer_request *peer_req =
2270 container_of(w, struct drbd_peer_request, w);
2271 struct drbd_connection *connection = peer_req->peer_device->connection;
7be8da07 2272
a8cd15ba 2273 return e_send_ack(w, connection->agreed_pro_version >= 100 ?
d4dabbe2 2274 P_RETRY_WRITE : P_SUPERSEDED);
7be8da07 2275}
b411b363 2276
3e394da1
AG
2277static bool seq_greater(u32 a, u32 b)
2278{
2279 /*
2280 * We assume 32-bit wrap-around here.
2281 * For 24-bit wrap-around, we would have to shift:
2282 * a <<= 8; b <<= 8;
2283 */
2284 return (s32)a - (s32)b > 0;
2285}
b411b363 2286
3e394da1
AG
2287static u32 seq_max(u32 a, u32 b)
2288{
2289 return seq_greater(a, b) ? a : b;
b411b363
PR
2290}
2291
69a22773 2292static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
3e394da1 2293{
69a22773 2294 struct drbd_device *device = peer_device->device;
3c13b680 2295 unsigned int newest_peer_seq;
3e394da1 2296
69a22773 2297 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
b30ab791
AG
2298 spin_lock(&device->peer_seq_lock);
2299 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2300 device->peer_seq = newest_peer_seq;
2301 spin_unlock(&device->peer_seq_lock);
2302 /* wake up only if we actually changed device->peer_seq */
3c13b680 2303 if (peer_seq == newest_peer_seq)
b30ab791 2304 wake_up(&device->seq_wait);
7be8da07 2305 }
b411b363
PR
2306}
2307
d93f6302 2308static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
b6a370ba 2309{
d93f6302
LE
2310 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2311}
b6a370ba 2312
d93f6302 2313/* maybe change sync_ee into interval trees as well? */
b30ab791 2314static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
d93f6302
LE
2315{
2316 struct drbd_peer_request *rs_req;
7e5fec31 2317 bool rv = false;
b6a370ba 2318
0500813f 2319 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2320 list_for_each_entry(rs_req, &device->sync_ee, w.list) {
d93f6302
LE
2321 if (overlaps(peer_req->i.sector, peer_req->i.size,
2322 rs_req->i.sector, rs_req->i.size)) {
7e5fec31 2323 rv = true;
b6a370ba
PR
2324 break;
2325 }
2326 }
0500813f 2327 spin_unlock_irq(&device->resource->req_lock);
b6a370ba
PR
2328
2329 return rv;
2330}
2331
b411b363
PR
2332/* Called from receive_Data.
2333 * Synchronize packets on sock with packets on msock.
2334 *
2335 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2336 * packet traveling on msock, they are still processed in the order they have
2337 * been sent.
2338 *
2339 * Note: we don't care for Ack packets overtaking P_DATA packets.
2340 *
b30ab791 2341 * In case packet_seq is larger than device->peer_seq number, there are
b411b363 2342 * outstanding packets on the msock. We wait for them to arrive.
b30ab791 2343 * In case we are the logically next packet, we update device->peer_seq
b411b363
PR
2344 * ourselves. Correctly handles 32bit wrap around.
2345 *
2346 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
2347 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2348 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
2349 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
2350 *
2351 * returns 0 if we may process the packet,
2352 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
69a22773 2353static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
b411b363 2354{
69a22773 2355 struct drbd_device *device = peer_device->device;
b411b363 2356 DEFINE_WAIT(wait);
b411b363 2357 long timeout;
b874d231 2358 int ret = 0, tp;
7be8da07 2359
69a22773 2360 if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
7be8da07
AG
2361 return 0;
2362
b30ab791 2363 spin_lock(&device->peer_seq_lock);
b411b363 2364 for (;;) {
b30ab791
AG
2365 if (!seq_greater(peer_seq - 1, device->peer_seq)) {
2366 device->peer_seq = seq_max(device->peer_seq, peer_seq);
b411b363 2367 break;
7be8da07 2368 }
b874d231 2369
b411b363
PR
2370 if (signal_pending(current)) {
2371 ret = -ERESTARTSYS;
2372 break;
2373 }
b874d231
PR
2374
2375 rcu_read_lock();
5dd2ca19 2376 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
b874d231
PR
2377 rcu_read_unlock();
2378
2379 if (!tp)
2380 break;
2381
2382 /* Only need to wait if two_primaries is enabled */
b30ab791
AG
2383 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2384 spin_unlock(&device->peer_seq_lock);
44ed167d 2385 rcu_read_lock();
69a22773 2386 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
44ed167d 2387 rcu_read_unlock();
71b1c1eb 2388 timeout = schedule_timeout(timeout);
b30ab791 2389 spin_lock(&device->peer_seq_lock);
7be8da07 2390 if (!timeout) {
b411b363 2391 ret = -ETIMEDOUT;
d0180171 2392 drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
2393 break;
2394 }
2395 }
b30ab791
AG
2396 spin_unlock(&device->peer_seq_lock);
2397 finish_wait(&device->seq_wait, &wait);
b411b363
PR
2398 return ret;
2399}
2400
9945172a 2401static enum req_op wire_flags_to_bio_op(u32 dpf)
bb3cc85e 2402{
f31e583a 2403 if (dpf & DP_ZEROES)
45c21793 2404 return REQ_OP_WRITE_ZEROES;
f31e583a
LE
2405 if (dpf & DP_DISCARD)
2406 return REQ_OP_DISCARD;
bb3cc85e
MC
2407 else
2408 return REQ_OP_WRITE;
76d2e7ec
PR
2409}
2410
ce668b6d
CB
2411/* see also bio_flags_to_wire() */
2412static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
2413{
2414 return wire_flags_to_bio_op(dpf) |
2415 (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2416 (dpf & DP_FUA ? REQ_FUA : 0) |
2417 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
2418}
2419
b30ab791 2420static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
7be8da07
AG
2421 unsigned int size)
2422{
2423 struct drbd_interval *i;
2424
2425 repeat:
b30ab791 2426 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2427 struct drbd_request *req;
2428 struct bio_and_error m;
2429
2430 if (!i->local)
2431 continue;
2432 req = container_of(i, struct drbd_request, i);
2433 if (!(req->rq_state & RQ_POSTPONED))
2434 continue;
2435 req->rq_state &= ~RQ_POSTPONED;
2436 __req_mod(req, NEG_ACKED, &m);
0500813f 2437 spin_unlock_irq(&device->resource->req_lock);
7be8da07 2438 if (m.bio)
b30ab791 2439 complete_master_bio(device, &m);
0500813f 2440 spin_lock_irq(&device->resource->req_lock);
7be8da07
AG
2441 goto repeat;
2442 }
2443}
2444
b30ab791 2445static int handle_write_conflicts(struct drbd_device *device,
7be8da07
AG
2446 struct drbd_peer_request *peer_req)
2447{
e33b32de 2448 struct drbd_connection *connection = peer_req->peer_device->connection;
bde89a9e 2449 bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
7be8da07
AG
2450 sector_t sector = peer_req->i.sector;
2451 const unsigned int size = peer_req->i.size;
2452 struct drbd_interval *i;
2453 bool equal;
2454 int err;
2455
2456 /*
2457 * Inserting the peer request into the write_requests tree will prevent
2458 * new conflicting local requests from being added.
2459 */
b30ab791 2460 drbd_insert_interval(&device->write_requests, &peer_req->i);
7be8da07
AG
2461
2462 repeat:
b30ab791 2463 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2464 if (i == &peer_req->i)
2465 continue;
08d0dabf
LE
2466 if (i->completed)
2467 continue;
7be8da07
AG
2468
2469 if (!i->local) {
2470 /*
2471 * Our peer has sent a conflicting remote request; this
2472 * should not happen in a two-node setup. Wait for the
2473 * earlier peer request to complete.
2474 */
b30ab791 2475 err = drbd_wait_misc(device, i);
7be8da07
AG
2476 if (err)
2477 goto out;
2478 goto repeat;
2479 }
2480
2481 equal = i->sector == sector && i->size == size;
2482 if (resolve_conflicts) {
2483 /*
2484 * If the peer request is fully contained within the
d4dabbe2
LE
2485 * overlapping request, it can be considered overwritten
2486 * and thus superseded; otherwise, it will be retried
2487 * once all overlapping requests have completed.
7be8da07 2488 */
d4dabbe2 2489 bool superseded = i->sector <= sector && i->sector +
7be8da07
AG
2490 (i->size >> 9) >= sector + (size >> 9);
2491
2492 if (!equal)
d0180171 2493 drbd_alert(device, "Concurrent writes detected: "
7be8da07
AG
2494 "local=%llus +%u, remote=%llus +%u, "
2495 "assuming %s came first\n",
2496 (unsigned long long)i->sector, i->size,
2497 (unsigned long long)sector, size,
d4dabbe2 2498 superseded ? "local" : "remote");
7be8da07 2499
a8cd15ba 2500 peer_req->w.cb = superseded ? e_send_superseded :
7be8da07 2501 e_send_retry_write;
a8cd15ba 2502 list_add_tail(&peer_req->w.list, &device->done_ee);
668700b4 2503 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
7be8da07
AG
2504
2505 err = -ENOENT;
2506 goto out;
2507 } else {
2508 struct drbd_request *req =
2509 container_of(i, struct drbd_request, i);
2510
2511 if (!equal)
d0180171 2512 drbd_alert(device, "Concurrent writes detected: "
7be8da07
AG
2513 "local=%llus +%u, remote=%llus +%u\n",
2514 (unsigned long long)i->sector, i->size,
2515 (unsigned long long)sector, size);
2516
2517 if (req->rq_state & RQ_LOCAL_PENDING ||
2518 !(req->rq_state & RQ_POSTPONED)) {
2519 /*
2520 * Wait for the node with the discard flag to
d4dabbe2
LE
2521 * decide if this request has been superseded
2522 * or needs to be retried.
2523 * Requests that have been superseded will
7be8da07
AG
2524 * disappear from the write_requests tree.
2525 *
2526 * In addition, wait for the conflicting
2527 * request to finish locally before submitting
2528 * the conflicting peer request.
2529 */
b30ab791 2530 err = drbd_wait_misc(device, &req->i);
7be8da07 2531 if (err) {
e33b32de 2532 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
b30ab791 2533 fail_postponed_requests(device, sector, size);
7be8da07
AG
2534 goto out;
2535 }
2536 goto repeat;
2537 }
2538 /*
2539 * Remember to restart the conflicting requests after
2540 * the new peer request has completed.
2541 */
2542 peer_req->flags |= EE_RESTART_REQUESTS;
2543 }
2544 }
2545 err = 0;
2546
2547 out:
2548 if (err)
b30ab791 2549 drbd_remove_epoch_entry_interval(device, peer_req);
7be8da07
AG
2550 return err;
2551}
2552
b411b363 2553/* mirrored write */
bde89a9e 2554static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2555{
9f4fe9ad 2556 struct drbd_peer_device *peer_device;
b30ab791 2557 struct drbd_device *device;
21ae5d7f 2558 struct net_conf *nc;
b411b363 2559 sector_t sector;
db830c46 2560 struct drbd_peer_request *peer_req;
e658983a 2561 struct p_data *p = pi->data;
7be8da07 2562 u32 peer_seq = be32_to_cpu(p->seq_num);
b411b363 2563 u32 dp_flags;
302bdeae 2564 int err, tp;
b411b363 2565
9f4fe9ad
AG
2566 peer_device = conn_peer_device(connection, pi->vnr);
2567 if (!peer_device)
4a76b161 2568 return -EIO;
9f4fe9ad 2569 device = peer_device->device;
b411b363 2570
b30ab791 2571 if (!get_ldev(device)) {
82bc0194
AG
2572 int err2;
2573
69a22773
AG
2574 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2575 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
bde89a9e 2576 atomic_inc(&connection->current_epoch->epoch_size);
69a22773 2577 err2 = drbd_drain_block(peer_device, pi->size);
82bc0194
AG
2578 if (!err)
2579 err = err2;
2580 return err;
b411b363
PR
2581 }
2582
fcefa62e
AG
2583 /*
2584 * Corresponding put_ldev done either below (on various errors), or in
2585 * drbd_peer_request_endio, if we successfully submit the data at the
2586 * end of this function.
2587 */
b411b363
PR
2588
2589 sector = be64_to_cpu(p->sector);
a0fb3c47 2590 peer_req = read_in_block(peer_device, p->block_id, sector, pi);
db830c46 2591 if (!peer_req) {
b30ab791 2592 put_ldev(device);
82bc0194 2593 return -EIO;
b411b363
PR
2594 }
2595
a8cd15ba 2596 peer_req->w.cb = e_end_block;
21ae5d7f
LE
2597 peer_req->submit_jif = jiffies;
2598 peer_req->flags |= EE_APPLICATION;
b411b363 2599
688593c5 2600 dp_flags = be32_to_cpu(p->dp_flags);
ce668b6d 2601 peer_req->opf = wire_flags_to_bio(connection, dp_flags);
a0fb3c47 2602 if (pi->cmd == P_TRIM) {
f31e583a 2603 D_ASSERT(peer_device, peer_req->i.size > 0);
ce668b6d 2604 D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
f31e583a
LE
2605 D_ASSERT(peer_device, peer_req->pages == NULL);
2606 /* need to play safe: an older DRBD sender
2607 * may mean zero-out while sending P_TRIM. */
2608 if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
2609 peer_req->flags |= EE_ZEROOUT;
2610 } else if (pi->cmd == P_ZEROES) {
a0fb3c47 2611 D_ASSERT(peer_device, peer_req->i.size > 0);
ce668b6d 2612 D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
a0fb3c47 2613 D_ASSERT(peer_device, peer_req->pages == NULL);
f31e583a
LE
2614 /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
2615 if (dp_flags & DP_DISCARD)
2616 peer_req->flags |= EE_TRIM;
a0fb3c47 2617 } else if (peer_req->pages == NULL) {
0b0ba1ef
AG
2618 D_ASSERT(device, peer_req->i.size == 0);
2619 D_ASSERT(device, dp_flags & DP_FLUSH);
a73ff323 2620 }
688593c5
LE
2621
2622 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2623 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2624
bde89a9e
AG
2625 spin_lock(&connection->epoch_lock);
2626 peer_req->epoch = connection->current_epoch;
db830c46
AG
2627 atomic_inc(&peer_req->epoch->epoch_size);
2628 atomic_inc(&peer_req->epoch->active);
bde89a9e 2629 spin_unlock(&connection->epoch_lock);
b411b363 2630
302bdeae 2631 rcu_read_lock();
21ae5d7f
LE
2632 nc = rcu_dereference(peer_device->connection->net_conf);
2633 tp = nc->two_primaries;
2634 if (peer_device->connection->agreed_pro_version < 100) {
2635 switch (nc->wire_protocol) {
2636 case DRBD_PROT_C:
2637 dp_flags |= DP_SEND_WRITE_ACK;
2638 break;
2639 case DRBD_PROT_B:
2640 dp_flags |= DP_SEND_RECEIVE_ACK;
2641 break;
2642 }
2643 }
302bdeae 2644 rcu_read_unlock();
21ae5d7f
LE
2645
2646 if (dp_flags & DP_SEND_WRITE_ACK) {
2647 peer_req->flags |= EE_SEND_WRITE_ACK;
2648 inc_unacked(device);
2649 /* corresponding dec_unacked() in e_end_block()
2650 * respective _drbd_clear_done_ee */
2651 }
2652
2653 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2654 /* I really don't like it that the receiver thread
2655 * sends on the msock, but anyways */
5dd2ca19 2656 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
21ae5d7f
LE
2657 }
2658
302bdeae 2659 if (tp) {
21ae5d7f
LE
2660 /* two primaries implies protocol C */
2661 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
302bdeae 2662 peer_req->flags |= EE_IN_INTERVAL_TREE;
69a22773 2663 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
7be8da07 2664 if (err)
b411b363 2665 goto out_interrupted;
0500813f 2666 spin_lock_irq(&device->resource->req_lock);
b30ab791 2667 err = handle_write_conflicts(device, peer_req);
7be8da07 2668 if (err) {
0500813f 2669 spin_unlock_irq(&device->resource->req_lock);
7be8da07 2670 if (err == -ENOENT) {
b30ab791 2671 put_ldev(device);
82bc0194 2672 return 0;
b411b363 2673 }
7be8da07 2674 goto out_interrupted;
b411b363 2675 }
b874d231 2676 } else {
69a22773 2677 update_peer_seq(peer_device, peer_seq);
0500813f 2678 spin_lock_irq(&device->resource->req_lock);
b874d231 2679 }
a34592ff 2680 /* TRIM and is processed synchronously,
9104d31a 2681 * we wait for all pending requests, respectively wait for
a0fb3c47
LE
2682 * active_ee to become empty in drbd_submit_peer_request();
2683 * better not add ourselves here. */
a34592ff 2684 if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0)
b9ed7080 2685 list_add_tail(&peer_req->w.list, &device->active_ee);
0500813f 2686 spin_unlock_irq(&device->resource->req_lock);
b411b363 2687
b30ab791
AG
2688 if (device->state.conn == C_SYNC_TARGET)
2689 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
b411b363 2690
b30ab791 2691 if (device->state.pdsk < D_INCONSISTENT) {
b411b363 2692 /* In case we have the only disk of the cluster, */
b30ab791 2693 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
db830c46 2694 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
4dd726f0 2695 drbd_al_begin_io(device, &peer_req->i);
21ae5d7f 2696 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
b411b363
PR
2697 }
2698
ce668b6d 2699 err = drbd_submit_peer_request(peer_req);
82bc0194
AG
2700 if (!err)
2701 return 0;
b411b363 2702
10f6d992 2703 /* don't care for the reason here */
d0180171 2704 drbd_err(device, "submit failed, triggering re-connect\n");
0500813f 2705 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2706 list_del(&peer_req->w.list);
b30ab791 2707 drbd_remove_epoch_entry_interval(device, peer_req);
0500813f 2708 spin_unlock_irq(&device->resource->req_lock);
21ae5d7f
LE
2709 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2710 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
b30ab791 2711 drbd_al_complete_io(device, &peer_req->i);
21ae5d7f 2712 }
22cc37a9 2713
b411b363 2714out_interrupted:
7e5fec31 2715 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
b30ab791
AG
2716 put_ldev(device);
2717 drbd_free_peer_req(device, peer_req);
82bc0194 2718 return err;
b411b363
PR
2719}
2720
0f0601f4
LE
2721/* We may throttle resync, if the lower device seems to be busy,
2722 * and current sync rate is above c_min_rate.
2723 *
2724 * To decide whether or not the lower device is busy, we use a scheme similar
2725 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2726 * (more than 64 sectors) of activity we cannot account for with our own resync
2727 * activity, it obviously is "busy".
2728 *
2729 * The current sync rate used here uses only the most recent two step marks,
2730 * to have a short time average so we can react faster.
2731 */
ad3fee79
LE
2732bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2733 bool throttle_if_app_is_waiting)
0f0601f4 2734{
e3555d85 2735 struct lc_element *tmp;
ad3fee79 2736 bool throttle = drbd_rs_c_min_rate_throttle(device);
daeda1cc 2737
ad3fee79
LE
2738 if (!throttle || throttle_if_app_is_waiting)
2739 return throttle;
0f0601f4 2740
b30ab791
AG
2741 spin_lock_irq(&device->al_lock);
2742 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
e3555d85
PR
2743 if (tmp) {
2744 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
e8299874
LE
2745 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2746 throttle = false;
ad3fee79
LE
2747 /* Do not slow down if app IO is already waiting for this extent,
2748 * and our progress is necessary for application IO to complete. */
e3555d85 2749 }
b30ab791 2750 spin_unlock_irq(&device->al_lock);
e3555d85 2751
e8299874
LE
2752 return throttle;
2753}
2754
2755bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2756{
8c40c7c4 2757 struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
e8299874
LE
2758 unsigned long db, dt, dbdt;
2759 unsigned int c_min_rate;
2760 int curr_events;
2761
2762 rcu_read_lock();
2763 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2764 rcu_read_unlock();
2765
2766 /* feature disabled? */
2767 if (c_min_rate == 0)
2768 return false;
2769
8446fe92 2770 curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
b30ab791 2771 atomic_read(&device->rs_sect_ev);
ad3fee79
LE
2772
2773 if (atomic_read(&device->ap_actlog_cnt)
ff8bd88b 2774 || curr_events - device->rs_last_events > 64) {
0f0601f4
LE
2775 unsigned long rs_left;
2776 int i;
2777
b30ab791 2778 device->rs_last_events = curr_events;
0f0601f4
LE
2779
2780 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2781 * approx. */
b30ab791 2782 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2649f080 2783
b30ab791
AG
2784 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2785 rs_left = device->ov_left;
2649f080 2786 else
b30ab791 2787 rs_left = drbd_bm_total_weight(device) - device->rs_failed;
0f0601f4 2788
b30ab791 2789 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
0f0601f4
LE
2790 if (!dt)
2791 dt++;
b30ab791 2792 db = device->rs_mark_left[i] - rs_left;
0f0601f4
LE
2793 dbdt = Bit2KB(db/dt);
2794
daeda1cc 2795 if (dbdt > c_min_rate)
e8299874 2796 return true;
0f0601f4 2797 }
e8299874 2798 return false;
0f0601f4
LE
2799}
2800
bde89a9e 2801static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2802{
9f4fe9ad 2803 struct drbd_peer_device *peer_device;
b30ab791 2804 struct drbd_device *device;
b411b363 2805 sector_t sector;
4a76b161 2806 sector_t capacity;
db830c46 2807 struct drbd_peer_request *peer_req;
b411b363 2808 struct digest_info *di = NULL;
b18b37be 2809 int size, verb;
e658983a 2810 struct p_block_req *p = pi->data;
4a76b161 2811
9f4fe9ad
AG
2812 peer_device = conn_peer_device(connection, pi->vnr);
2813 if (!peer_device)
4a76b161 2814 return -EIO;
9f4fe9ad 2815 device = peer_device->device;
155bd9d1 2816 capacity = get_capacity(device->vdisk);
b411b363
PR
2817
2818 sector = be64_to_cpu(p->sector);
2819 size = be32_to_cpu(p->blksize);
2820
c670a398 2821 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
d0180171 2822 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
b411b363 2823 (unsigned long long)sector, size);
82bc0194 2824 return -EINVAL;
b411b363
PR
2825 }
2826 if (sector + (size>>9) > capacity) {
d0180171 2827 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
b411b363 2828 (unsigned long long)sector, size);
82bc0194 2829 return -EINVAL;
b411b363
PR
2830 }
2831
b30ab791 2832 if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
b18b37be 2833 verb = 1;
e2857216 2834 switch (pi->cmd) {
b18b37be 2835 case P_DATA_REQUEST:
69a22773 2836 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
b18b37be 2837 break;
700ca8c0 2838 case P_RS_THIN_REQ:
b18b37be
PR
2839 case P_RS_DATA_REQUEST:
2840 case P_CSUM_RS_REQUEST:
2841 case P_OV_REQUEST:
69a22773 2842 drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
b18b37be
PR
2843 break;
2844 case P_OV_REPLY:
2845 verb = 0;
b30ab791 2846 dec_rs_pending(device);
69a22773 2847 drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
b18b37be
PR
2848 break;
2849 default:
49ba9b1b 2850 BUG();
b18b37be 2851 }
e3fa02d7 2852 if (verb && drbd_ratelimit())
d0180171 2853 drbd_err(device, "Can not satisfy peer's read request, "
b411b363 2854 "no local data.\n");
b18b37be 2855
a821cc4a 2856 /* drain possibly payload */
69a22773 2857 return drbd_drain_block(peer_device, pi->size);
b411b363
PR
2858 }
2859
2860 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2861 * "criss-cross" setup, that might cause write-out on some other DRBD,
2862 * which in turn might block on the other node at this very place. */
a0fb3c47 2863 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
9104d31a 2864 size, GFP_NOIO);
db830c46 2865 if (!peer_req) {
b30ab791 2866 put_ldev(device);
82bc0194 2867 return -ENOMEM;
b411b363 2868 }
ce668b6d 2869 peer_req->opf = REQ_OP_READ;
b411b363 2870
e2857216 2871 switch (pi->cmd) {
b411b363 2872 case P_DATA_REQUEST:
a8cd15ba 2873 peer_req->w.cb = w_e_end_data_req;
80a40e43 2874 /* application IO, don't drbd_rs_begin_io */
21ae5d7f 2875 peer_req->flags |= EE_APPLICATION;
80a40e43
LE
2876 goto submit;
2877
700ca8c0
PR
2878 case P_RS_THIN_REQ:
2879 /* If at some point in the future we have a smart way to
2880 find out if this data block is completely deallocated,
2881 then we would do something smarter here than reading
2882 the block... */
2883 peer_req->flags |= EE_RS_THIN_REQ;
df561f66 2884 fallthrough;
b411b363 2885 case P_RS_DATA_REQUEST:
a8cd15ba 2886 peer_req->w.cb = w_e_end_rsdata_req;
5f9915bb 2887 /* used in the sector offset progress display */
b30ab791 2888 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2889 break;
2890
2891 case P_OV_REPLY:
2892 case P_CSUM_RS_REQUEST:
e2857216 2893 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2894 if (!di)
2895 goto out_free_e;
2896
e2857216 2897 di->digest_size = pi->size;
b411b363
PR
2898 di->digest = (((char *)di)+sizeof(struct digest_info));
2899
db830c46
AG
2900 peer_req->digest = di;
2901 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2902
9f4fe9ad 2903 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
b411b363
PR
2904 goto out_free_e;
2905
e2857216 2906 if (pi->cmd == P_CSUM_RS_REQUEST) {
9f4fe9ad 2907 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
a8cd15ba 2908 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb 2909 /* used in the sector offset progress display */
b30ab791 2910 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
aaaba345
LE
2911 /* remember to report stats in drbd_resync_finished */
2912 device->use_csums = true;
e2857216 2913 } else if (pi->cmd == P_OV_REPLY) {
2649f080 2914 /* track progress, we may need to throttle */
b30ab791 2915 atomic_add(size >> 9, &device->rs_sect_in);
a8cd15ba 2916 peer_req->w.cb = w_e_end_ov_reply;
b30ab791 2917 dec_rs_pending(device);
0f0601f4
LE
2918 /* drbd_rs_begin_io done when we sent this request,
2919 * but accounting still needs to be done. */
2920 goto submit_for_resync;
b411b363
PR
2921 }
2922 break;
2923
2924 case P_OV_REQUEST:
b30ab791 2925 if (device->ov_start_sector == ~(sector_t)0 &&
9f4fe9ad 2926 peer_device->connection->agreed_pro_version >= 90) {
de228bba
LE
2927 unsigned long now = jiffies;
2928 int i;
b30ab791
AG
2929 device->ov_start_sector = sector;
2930 device->ov_position = sector;
2931 device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2932 device->rs_total = device->ov_left;
de228bba 2933 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
b30ab791
AG
2934 device->rs_mark_left[i] = device->ov_left;
2935 device->rs_mark_time[i] = now;
de228bba 2936 }
d0180171 2937 drbd_info(device, "Online Verify start sector: %llu\n",
b411b363
PR
2938 (unsigned long long)sector);
2939 }
a8cd15ba 2940 peer_req->w.cb = w_e_end_ov_req;
b411b363
PR
2941 break;
2942
b411b363 2943 default:
49ba9b1b 2944 BUG();
b411b363
PR
2945 }
2946
0f0601f4
LE
2947 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2948 * wrt the receiver, but it is not as straightforward as it may seem.
2949 * Various places in the resync start and stop logic assume resync
2950 * requests are processed in order, requeuing this on the worker thread
2951 * introduces a bunch of new code for synchronization between threads.
2952 *
2953 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2954 * "forever", throttling after drbd_rs_begin_io will lock that extent
2955 * for application writes for the same time. For now, just throttle
2956 * here, where the rest of the code expects the receiver to sleep for
2957 * a while, anyways.
2958 */
2959
2960 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2961 * this defers syncer requests for some time, before letting at least
2962 * on request through. The resync controller on the receiving side
2963 * will adapt to the incoming rate accordingly.
2964 *
2965 * We cannot throttle here if remote is Primary/SyncTarget:
2966 * we would also throttle its application reads.
2967 * In that case, throttling is done on the SyncTarget only.
2968 */
c5a2c150
LE
2969
2970 /* Even though this may be a resync request, we do add to "read_ee";
2971 * "sync_ee" is only used for resync WRITEs.
2972 * Add to list early, so debugfs can find this request
2973 * even if we have to sleep below. */
2974 spin_lock_irq(&device->resource->req_lock);
2975 list_add_tail(&peer_req->w.list, &device->read_ee);
2976 spin_unlock_irq(&device->resource->req_lock);
2977
944410e9 2978 update_receiver_timing_details(connection, drbd_rs_should_slow_down);
ad3fee79
LE
2979 if (device->state.peer != R_PRIMARY
2980 && drbd_rs_should_slow_down(device, sector, false))
e3555d85 2981 schedule_timeout_uninterruptible(HZ/10);
944410e9 2982 update_receiver_timing_details(connection, drbd_rs_begin_io);
b30ab791 2983 if (drbd_rs_begin_io(device, sector))
80a40e43 2984 goto out_free_e;
b411b363 2985
0f0601f4 2986submit_for_resync:
b30ab791 2987 atomic_add(size >> 9, &device->rs_sect_ev);
0f0601f4 2988
80a40e43 2989submit:
944410e9 2990 update_receiver_timing_details(connection, drbd_submit_peer_request);
b30ab791 2991 inc_unacked(device);
ce668b6d 2992 if (drbd_submit_peer_request(peer_req) == 0)
82bc0194 2993 return 0;
b411b363 2994
10f6d992 2995 /* don't care for the reason here */
d0180171 2996 drbd_err(device, "submit failed, triggering re-connect\n");
c5a2c150
LE
2997
2998out_free_e:
0500813f 2999 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 3000 list_del(&peer_req->w.list);
0500813f 3001 spin_unlock_irq(&device->resource->req_lock);
22cc37a9
LE
3002 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
3003
b30ab791
AG
3004 put_ldev(device);
3005 drbd_free_peer_req(device, peer_req);
82bc0194 3006 return -EIO;
b411b363
PR
3007}
3008
9b48ff07 3009/*
69a22773
AG
3010 * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries
3011 */
3012static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3013{
69a22773 3014 struct drbd_device *device = peer_device->device;
b411b363
PR
3015 int self, peer, rv = -100;
3016 unsigned long ch_self, ch_peer;
44ed167d 3017 enum drbd_after_sb_p after_sb_0p;
b411b363 3018
b30ab791
AG
3019 self = device->ldev->md.uuid[UI_BITMAP] & 1;
3020 peer = device->p_uuid[UI_BITMAP] & 1;
b411b363 3021
b30ab791
AG
3022 ch_peer = device->p_uuid[UI_SIZE];
3023 ch_self = device->comm_bm_set;
b411b363 3024
44ed167d 3025 rcu_read_lock();
69a22773 3026 after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
44ed167d
PR
3027 rcu_read_unlock();
3028 switch (after_sb_0p) {
b411b363
PR
3029 case ASB_CONSENSUS:
3030 case ASB_DISCARD_SECONDARY:
3031 case ASB_CALL_HELPER:
44ed167d 3032 case ASB_VIOLENTLY:
d0180171 3033 drbd_err(device, "Configuration error.\n");
b411b363
PR
3034 break;
3035 case ASB_DISCONNECT:
3036 break;
3037 case ASB_DISCARD_YOUNGER_PRI:
3038 if (self == 0 && peer == 1) {
3039 rv = -1;
3040 break;
3041 }
3042 if (self == 1 && peer == 0) {
3043 rv = 1;
3044 break;
3045 }
df561f66 3046 fallthrough; /* to one of the other strategies */
b411b363
PR
3047 case ASB_DISCARD_OLDER_PRI:
3048 if (self == 0 && peer == 1) {
3049 rv = 1;
3050 break;
3051 }
3052 if (self == 1 && peer == 0) {
3053 rv = -1;
3054 break;
3055 }
3056 /* Else fall through to one of the other strategies... */
d0180171 3057 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
b411b363 3058 "Using discard-least-changes instead\n");
df561f66 3059 fallthrough;
b411b363
PR
3060 case ASB_DISCARD_ZERO_CHG:
3061 if (ch_peer == 0 && ch_self == 0) {
69a22773 3062 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
b411b363
PR
3063 ? -1 : 1;
3064 break;
3065 } else {
3066 if (ch_peer == 0) { rv = 1; break; }
3067 if (ch_self == 0) { rv = -1; break; }
3068 }
44ed167d 3069 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363 3070 break;
df561f66 3071 fallthrough;
b411b363
PR
3072 case ASB_DISCARD_LEAST_CHG:
3073 if (ch_self < ch_peer)
3074 rv = -1;
3075 else if (ch_self > ch_peer)
3076 rv = 1;
3077 else /* ( ch_self == ch_peer ) */
3078 /* Well, then use something else. */
69a22773 3079 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
b411b363
PR
3080 ? -1 : 1;
3081 break;
3082 case ASB_DISCARD_LOCAL:
3083 rv = -1;
3084 break;
3085 case ASB_DISCARD_REMOTE:
3086 rv = 1;
3087 }
3088
3089 return rv;
3090}
3091
9b48ff07 3092/*
69a22773
AG
3093 * drbd_asb_recover_1p - Recover after split-brain with one remaining primary
3094 */
3095static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3096{
69a22773 3097 struct drbd_device *device = peer_device->device;
6184ea21 3098 int hg, rv = -100;
44ed167d 3099 enum drbd_after_sb_p after_sb_1p;
b411b363 3100
44ed167d 3101 rcu_read_lock();
69a22773 3102 after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
44ed167d
PR
3103 rcu_read_unlock();
3104 switch (after_sb_1p) {
b411b363
PR
3105 case ASB_DISCARD_YOUNGER_PRI:
3106 case ASB_DISCARD_OLDER_PRI:
3107 case ASB_DISCARD_LEAST_CHG:
3108 case ASB_DISCARD_LOCAL:
3109 case ASB_DISCARD_REMOTE:
44ed167d 3110 case ASB_DISCARD_ZERO_CHG:
d0180171 3111 drbd_err(device, "Configuration error.\n");
b411b363
PR
3112 break;
3113 case ASB_DISCONNECT:
3114 break;
3115 case ASB_CONSENSUS:
69a22773 3116 hg = drbd_asb_recover_0p(peer_device);
b30ab791 3117 if (hg == -1 && device->state.role == R_SECONDARY)
b411b363 3118 rv = hg;
b30ab791 3119 if (hg == 1 && device->state.role == R_PRIMARY)
b411b363
PR
3120 rv = hg;
3121 break;
3122 case ASB_VIOLENTLY:
69a22773 3123 rv = drbd_asb_recover_0p(peer_device);
b411b363
PR
3124 break;
3125 case ASB_DISCARD_SECONDARY:
b30ab791 3126 return device->state.role == R_PRIMARY ? 1 : -1;
b411b363 3127 case ASB_CALL_HELPER:
69a22773 3128 hg = drbd_asb_recover_0p(peer_device);
b30ab791 3129 if (hg == -1 && device->state.role == R_PRIMARY) {
bb437946
AG
3130 enum drbd_state_rv rv2;
3131
b411b363
PR
3132 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3133 * we might be here in C_WF_REPORT_PARAMS which is transient.
3134 * we do not need to wait for the after state change work either. */
b30ab791 3135 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
bb437946 3136 if (rv2 != SS_SUCCESS) {
b30ab791 3137 drbd_khelper(device, "pri-lost-after-sb");
b411b363 3138 } else {
d0180171 3139 drbd_warn(device, "Successfully gave up primary role.\n");
b411b363
PR
3140 rv = hg;
3141 }
3142 } else
3143 rv = hg;
3144 }
3145
3146 return rv;
3147}
3148
9b48ff07 3149/*
69a22773
AG
3150 * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries
3151 */
3152static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3153{
69a22773 3154 struct drbd_device *device = peer_device->device;
6184ea21 3155 int hg, rv = -100;
44ed167d 3156 enum drbd_after_sb_p after_sb_2p;
b411b363 3157
44ed167d 3158 rcu_read_lock();
69a22773 3159 after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
44ed167d
PR
3160 rcu_read_unlock();
3161 switch (after_sb_2p) {
b411b363
PR
3162 case ASB_DISCARD_YOUNGER_PRI:
3163 case ASB_DISCARD_OLDER_PRI:
3164 case ASB_DISCARD_LEAST_CHG:
3165 case ASB_DISCARD_LOCAL:
3166 case ASB_DISCARD_REMOTE:
3167 case ASB_CONSENSUS:
3168 case ASB_DISCARD_SECONDARY:
44ed167d 3169 case ASB_DISCARD_ZERO_CHG:
d0180171 3170 drbd_err(device, "Configuration error.\n");
b411b363
PR
3171 break;
3172 case ASB_VIOLENTLY:
69a22773 3173 rv = drbd_asb_recover_0p(peer_device);
b411b363
PR
3174 break;
3175 case ASB_DISCONNECT:
3176 break;
3177 case ASB_CALL_HELPER:
69a22773 3178 hg = drbd_asb_recover_0p(peer_device);
b411b363 3179 if (hg == -1) {
bb437946
AG
3180 enum drbd_state_rv rv2;
3181
b411b363
PR
3182 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3183 * we might be here in C_WF_REPORT_PARAMS which is transient.
3184 * we do not need to wait for the after state change work either. */
b30ab791 3185 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
bb437946 3186 if (rv2 != SS_SUCCESS) {
b30ab791 3187 drbd_khelper(device, "pri-lost-after-sb");
b411b363 3188 } else {
d0180171 3189 drbd_warn(device, "Successfully gave up primary role.\n");
b411b363
PR
3190 rv = hg;
3191 }
3192 } else
3193 rv = hg;
3194 }
3195
3196 return rv;
3197}
3198
b30ab791 3199static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
b411b363
PR
3200 u64 bits, u64 flags)
3201{
3202 if (!uuid) {
d0180171 3203 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
b411b363
PR
3204 return;
3205 }
d0180171 3206 drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
b411b363
PR
3207 text,
3208 (unsigned long long)uuid[UI_CURRENT],
3209 (unsigned long long)uuid[UI_BITMAP],
3210 (unsigned long long)uuid[UI_HISTORY_START],
3211 (unsigned long long)uuid[UI_HISTORY_END],
3212 (unsigned long long)bits,
3213 (unsigned long long)flags);
3214}
3215
3216/*
3217 100 after split brain try auto recover
3218 2 C_SYNC_SOURCE set BitMap
3219 1 C_SYNC_SOURCE use BitMap
3220 0 no Sync
3221 -1 C_SYNC_TARGET use BitMap
3222 -2 C_SYNC_TARGET set BitMap
3223 -100 after split brain, disconnect
3224-1000 unrelated data
4a23f264
PR
3225-1091 requires proto 91
3226-1096 requires proto 96
b411b363 3227 */
f2d3d75b
LE
3228
3229static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
b411b363 3230{
44a4d551
LE
3231 struct drbd_peer_device *const peer_device = first_peer_device(device);
3232 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
b411b363
PR
3233 u64 self, peer;
3234 int i, j;
3235
b30ab791
AG
3236 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
3237 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
3238
3239 *rule_nr = 10;
3240 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
3241 return 0;
3242
3243 *rule_nr = 20;
3244 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
3245 peer != UUID_JUST_CREATED)
3246 return -2;
3247
3248 *rule_nr = 30;
3249 if (self != UUID_JUST_CREATED &&
3250 (peer == UUID_JUST_CREATED || peer == (u64)0))
3251 return 2;
3252
3253 if (self == peer) {
3254 int rct, dc; /* roles at crash time */
3255
b30ab791 3256 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
b411b363 3257
44a4d551 3258 if (connection->agreed_pro_version < 91)
4a23f264 3259 return -1091;
b411b363 3260
b30ab791
AG
3261 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
3262 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
d0180171 3263 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
b30ab791
AG
3264 drbd_uuid_move_history(device);
3265 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3266 device->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 3267
b30ab791
AG
3268 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3269 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
3270 *rule_nr = 34;
3271 } else {
d0180171 3272 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
b411b363
PR
3273 *rule_nr = 36;
3274 }
3275
3276 return 1;
3277 }
3278
b30ab791 3279 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
b411b363 3280
44a4d551 3281 if (connection->agreed_pro_version < 91)
4a23f264 3282 return -1091;
b411b363 3283
b30ab791
AG
3284 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
3285 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
d0180171 3286 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
b411b363 3287
b30ab791
AG
3288 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
3289 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
3290 device->p_uuid[UI_BITMAP] = 0UL;
b411b363 3291
b30ab791 3292 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363
PR
3293 *rule_nr = 35;
3294 } else {
d0180171 3295 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
b411b363
PR
3296 *rule_nr = 37;
3297 }
3298
3299 return -1;
3300 }
3301
3302 /* Common power [off|failure] */
b30ab791
AG
3303 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
3304 (device->p_uuid[UI_FLAGS] & 2);
b411b363
PR
3305 /* lowest bit is set when we were primary,
3306 * next bit (weight 2) is set when peer was primary */
3307 *rule_nr = 40;
3308
f2d3d75b
LE
3309 /* Neither has the "crashed primary" flag set,
3310 * only a replication link hickup. */
3311 if (rct == 0)
3312 return 0;
3313
3314 /* Current UUID equal and no bitmap uuid; does not necessarily
3315 * mean this was a "simultaneous hard crash", maybe IO was
3316 * frozen, so no UUID-bump happened.
3317 * This is a protocol change, overload DRBD_FF_WSAME as flag
3318 * for "new-enough" peer DRBD version. */
3319 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
3320 *rule_nr = 41;
3321 if (!(connection->agreed_features & DRBD_FF_WSAME)) {
3322 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
3323 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
3324 }
3325 if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
3326 /* At least one has the "crashed primary" bit set,
3327 * both are primary now, but neither has rotated its UUIDs?
3328 * "Can not happen." */
3329 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
3330 return -100;
3331 }
3332 if (device->state.role == R_PRIMARY)
3333 return 1;
3334 return -1;
3335 }
3336
3337 /* Both are secondary.
3338 * Really looks like recovery from simultaneous hard crash.
3339 * Check which had been primary before, and arbitrate. */
b411b363 3340 switch (rct) {
f2d3d75b 3341 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
b411b363
PR
3342 case 1: /* self_pri && !peer_pri */ return 1;
3343 case 2: /* !self_pri && peer_pri */ return -1;
3344 case 3: /* self_pri && peer_pri */
44a4d551 3345 dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
b411b363
PR
3346 return dc ? -1 : 1;
3347 }
3348 }
3349
3350 *rule_nr = 50;
b30ab791 3351 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3352 if (self == peer)
3353 return -1;
3354
3355 *rule_nr = 51;
b30ab791 3356 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3357 if (self == peer) {
44a4d551 3358 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3359 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
3360 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
3361 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
3362 /* The last P_SYNC_UUID did not get though. Undo the last start of
3363 resync as sync source modifications of the peer's UUIDs. */
3364
44a4d551 3365 if (connection->agreed_pro_version < 91)
4a23f264 3366 return -1091;
b411b363 3367
b30ab791
AG
3368 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
3369 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
4a23f264 3370
d0180171 3371 drbd_info(device, "Lost last syncUUID packet, corrected:\n");
b30ab791 3372 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
4a23f264 3373
b411b363
PR
3374 return -1;
3375 }
3376 }
3377
3378 *rule_nr = 60;
b30ab791 3379 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
b411b363 3380 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3381 peer = device->p_uuid[i] & ~((u64)1);
b411b363
PR
3382 if (self == peer)
3383 return -2;
3384 }
3385
3386 *rule_nr = 70;
b30ab791
AG
3387 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3388 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
3389 if (self == peer)
3390 return 1;
3391
3392 *rule_nr = 71;
b30ab791 3393 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3394 if (self == peer) {
44a4d551 3395 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3396 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
3397 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
3398 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
3399 /* The last P_SYNC_UUID did not get though. Undo the last start of
3400 resync as sync source modifications of our UUIDs. */
3401
44a4d551 3402 if (connection->agreed_pro_version < 91)
4a23f264 3403 return -1091;
b411b363 3404
b30ab791
AG
3405 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
3406 __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
b411b363 3407
d0180171 3408 drbd_info(device, "Last syncUUID did not get through, corrected:\n");
b30ab791
AG
3409 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3410 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
3411
3412 return 1;
3413 }
3414 }
3415
3416
3417 *rule_nr = 80;
b30ab791 3418 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363 3419 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3420 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363
PR
3421 if (self == peer)
3422 return 2;
3423 }
3424
3425 *rule_nr = 90;
b30ab791
AG
3426 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3427 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3428 if (self == peer && self != ((u64)0))
3429 return 100;
3430
3431 *rule_nr = 100;
3432 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3433 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363 3434 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
b30ab791 3435 peer = device->p_uuid[j] & ~((u64)1);
b411b363
PR
3436 if (self == peer)
3437 return -100;
3438 }
3439 }
3440
3441 return -1000;
3442}
3443
3444/* drbd_sync_handshake() returns the new conn state on success, or
3445 CONN_MASK (-1) on failure.
3446 */
69a22773
AG
3447static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3448 enum drbd_role peer_role,
b411b363
PR
3449 enum drbd_disk_state peer_disk) __must_hold(local)
3450{
69a22773 3451 struct drbd_device *device = peer_device->device;
b411b363
PR
3452 enum drbd_conns rv = C_MASK;
3453 enum drbd_disk_state mydisk;
44ed167d 3454 struct net_conf *nc;
d29e89e3 3455 int hg, rule_nr, rr_conflict, tentative, always_asbp;
b411b363 3456
b30ab791 3457 mydisk = device->state.disk;
b411b363 3458 if (mydisk == D_NEGOTIATING)
b30ab791 3459 mydisk = device->new_state_tmp.disk;
b411b363 3460
d0180171 3461 drbd_info(device, "drbd_sync_handshake:\n");
9f2247bb 3462
b30ab791
AG
3463 spin_lock_irq(&device->ldev->md.uuid_lock);
3464 drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3465 drbd_uuid_dump(device, "peer", device->p_uuid,
3466 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363 3467
f2d3d75b 3468 hg = drbd_uuid_compare(device, peer_role, &rule_nr);
b30ab791 3469 spin_unlock_irq(&device->ldev->md.uuid_lock);
b411b363 3470
d0180171 3471 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
b411b363
PR
3472
3473 if (hg == -1000) {
d0180171 3474 drbd_alert(device, "Unrelated data, aborting!\n");
b411b363
PR
3475 return C_MASK;
3476 }
f2d3d75b
LE
3477 if (hg < -0x10000) {
3478 int proto, fflags;
3479 hg = -hg;
3480 proto = hg & 0xff;
3481 fflags = (hg >> 8) & 0xff;
3482 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3483 proto, fflags);
3484 return C_MASK;
3485 }
4a23f264 3486 if (hg < -1000) {
d0180171 3487 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
3488 return C_MASK;
3489 }
3490
3491 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3492 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
3493 int f = (hg == -100) || abs(hg) == 2;
3494 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3495 if (f)
3496 hg = hg*2;
d0180171 3497 drbd_info(device, "Becoming sync %s due to disk states.\n",
b411b363
PR
3498 hg > 0 ? "source" : "target");
3499 }
3500
3a11a487 3501 if (abs(hg) == 100)
b30ab791 3502 drbd_khelper(device, "initial-split-brain");
3a11a487 3503
44ed167d 3504 rcu_read_lock();
69a22773 3505 nc = rcu_dereference(peer_device->connection->net_conf);
d29e89e3
RK
3506 always_asbp = nc->always_asbp;
3507 rr_conflict = nc->rr_conflict;
3508 tentative = nc->tentative;
3509 rcu_read_unlock();
44ed167d 3510
d29e89e3 3511 if (hg == 100 || (hg == -100 && always_asbp)) {
b30ab791 3512 int pcount = (device->state.role == R_PRIMARY)
b411b363
PR
3513 + (peer_role == R_PRIMARY);
3514 int forced = (hg == -100);
3515
3516 switch (pcount) {
3517 case 0:
69a22773 3518 hg = drbd_asb_recover_0p(peer_device);
b411b363
PR
3519 break;
3520 case 1:
69a22773 3521 hg = drbd_asb_recover_1p(peer_device);
b411b363
PR
3522 break;
3523 case 2:
69a22773 3524 hg = drbd_asb_recover_2p(peer_device);
b411b363
PR
3525 break;
3526 }
3527 if (abs(hg) < 100) {
d0180171 3528 drbd_warn(device, "Split-Brain detected, %d primaries, "
b411b363
PR
3529 "automatically solved. Sync from %s node\n",
3530 pcount, (hg < 0) ? "peer" : "this");
3531 if (forced) {
d0180171 3532 drbd_warn(device, "Doing a full sync, since"
b411b363
PR
3533 " UUIDs where ambiguous.\n");
3534 hg = hg*2;
3535 }
3536 }
3537 }
3538
3539 if (hg == -100) {
b30ab791 3540 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
b411b363 3541 hg = -1;
b30ab791 3542 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
b411b363
PR
3543 hg = 1;
3544
3545 if (abs(hg) < 100)
d0180171 3546 drbd_warn(device, "Split-Brain detected, manually solved. "
b411b363
PR
3547 "Sync from %s node\n",
3548 (hg < 0) ? "peer" : "this");
3549 }
3550
3551 if (hg == -100) {
580b9767
LE
3552 /* FIXME this log message is not correct if we end up here
3553 * after an attempted attach on a diskless node.
3554 * We just refuse to attach -- well, we drop the "connection"
3555 * to that disk, in a way... */
d0180171 3556 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
b30ab791 3557 drbd_khelper(device, "split-brain");
b411b363
PR
3558 return C_MASK;
3559 }
3560
3561 if (hg > 0 && mydisk <= D_INCONSISTENT) {
d0180171 3562 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
b411b363
PR
3563 return C_MASK;
3564 }
3565
3566 if (hg < 0 && /* by intention we do not use mydisk here. */
b30ab791 3567 device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
44ed167d 3568 switch (rr_conflict) {
b411b363 3569 case ASB_CALL_HELPER:
b30ab791 3570 drbd_khelper(device, "pri-lost");
df561f66 3571 fallthrough;
b411b363 3572 case ASB_DISCONNECT:
d0180171 3573 drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
b411b363
PR
3574 return C_MASK;
3575 case ASB_VIOLENTLY:
d0180171 3576 drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
b411b363
PR
3577 "assumption\n");
3578 }
3579 }
3580
69a22773 3581 if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
cf14c2e9 3582 if (hg == 0)
d0180171 3583 drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
cf14c2e9 3584 else
d0180171 3585 drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
cf14c2e9
PR
3586 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3587 abs(hg) >= 2 ? "full" : "bit-map based");
3588 return C_MASK;
3589 }
3590
b411b363 3591 if (abs(hg) >= 2) {
d0180171 3592 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
b30ab791 3593 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
20ceb2b2 3594 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3595 return C_MASK;
3596 }
3597
3598 if (hg > 0) { /* become sync source. */
3599 rv = C_WF_BITMAP_S;
3600 } else if (hg < 0) { /* become sync target */
3601 rv = C_WF_BITMAP_T;
3602 } else {
3603 rv = C_CONNECTED;
b30ab791 3604 if (drbd_bm_total_weight(device)) {
d0180171 3605 drbd_info(device, "No resync, but %lu bits in bitmap!\n",
b30ab791 3606 drbd_bm_total_weight(device));
b411b363
PR
3607 }
3608 }
3609
3610 return rv;
3611}
3612
f179d76d 3613static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3614{
3615 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3616 if (peer == ASB_DISCARD_REMOTE)
3617 return ASB_DISCARD_LOCAL;
b411b363
PR
3618
3619 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3620 if (peer == ASB_DISCARD_LOCAL)
3621 return ASB_DISCARD_REMOTE;
b411b363
PR
3622
3623 /* everything else is valid if they are equal on both sides. */
f179d76d 3624 return peer;
b411b363
PR
3625}
3626
bde89a9e 3627static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3628{
e658983a 3629 struct p_protocol *p = pi->data;
036b17ea
PR
3630 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3631 int p_proto, p_discard_my_data, p_two_primaries, cf;
3632 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3633 char integrity_alg[SHARED_SECRET_MAX] = "";
3d0e6375 3634 struct crypto_shash *peer_integrity_tfm = NULL;
7aca6c75 3635 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3636
b411b363
PR
3637 p_proto = be32_to_cpu(p->protocol);
3638 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3639 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3640 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3641 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3642 cf = be32_to_cpu(p->conn_flags);
6139f60d 3643 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3644
bde89a9e 3645 if (connection->agreed_pro_version >= 87) {
86db0618 3646 int err;
cf14c2e9 3647
88104ca4 3648 if (pi->size > sizeof(integrity_alg))
86db0618 3649 return -EIO;
bde89a9e 3650 err = drbd_recv_all(connection, integrity_alg, pi->size);
86db0618
AG
3651 if (err)
3652 return err;
036b17ea 3653 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
b411b363
PR
3654 }
3655
7d4c782c 3656 if (pi->cmd != P_PROTOCOL_UPDATE) {
bde89a9e 3657 clear_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3658
fbc12f45 3659 if (cf & CF_DRY_RUN)
bde89a9e 3660 set_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3661
fbc12f45 3662 rcu_read_lock();
bde89a9e 3663 nc = rcu_dereference(connection->net_conf);
b411b363 3664
fbc12f45 3665 if (p_proto != nc->wire_protocol) {
1ec861eb 3666 drbd_err(connection, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3667 goto disconnect_rcu_unlock;
3668 }
b411b363 3669
fbc12f45 3670 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
1ec861eb 3671 drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3672 goto disconnect_rcu_unlock;
3673 }
b411b363 3674
fbc12f45 3675 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
1ec861eb 3676 drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3677 goto disconnect_rcu_unlock;
3678 }
b411b363 3679
fbc12f45 3680 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
1ec861eb 3681 drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3682 goto disconnect_rcu_unlock;
3683 }
b411b363 3684
fbc12f45 3685 if (p_discard_my_data && nc->discard_my_data) {
1ec861eb 3686 drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3687 goto disconnect_rcu_unlock;
3688 }
b411b363 3689
fbc12f45 3690 if (p_two_primaries != nc->two_primaries) {
1ec861eb 3691 drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3692 goto disconnect_rcu_unlock;
3693 }
b411b363 3694
fbc12f45 3695 if (strcmp(integrity_alg, nc->integrity_alg)) {
1ec861eb 3696 drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3697 goto disconnect_rcu_unlock;
3698 }
b411b363 3699
fbc12f45 3700 rcu_read_unlock();
b411b363
PR
3701 }
3702
7d4c782c
AG
3703 if (integrity_alg[0]) {
3704 int hash_size;
3705
3706 /*
3707 * We can only change the peer data integrity algorithm
3708 * here. Changing our own data integrity algorithm
3709 * requires that we send a P_PROTOCOL_UPDATE packet at
3710 * the same time; otherwise, the peer has no way to
3711 * tell between which packets the algorithm should
3712 * change.
3713 */
b411b363 3714
3d234b33 3715 peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
1b57e663
LE
3716 if (IS_ERR(peer_integrity_tfm)) {
3717 peer_integrity_tfm = NULL;
1ec861eb 3718 drbd_err(connection, "peer data-integrity-alg %s not supported\n",
7d4c782c
AG
3719 integrity_alg);
3720 goto disconnect;
3721 }
b411b363 3722
3d0e6375 3723 hash_size = crypto_shash_digestsize(peer_integrity_tfm);
7d4c782c
AG
3724 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3725 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3726 if (!(int_dig_in && int_dig_vv)) {
1ec861eb 3727 drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
b411b363
PR
3728 goto disconnect;
3729 }
b411b363
PR
3730 }
3731
7d4c782c 3732 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
8404e191 3733 if (!new_net_conf)
7d4c782c 3734 goto disconnect;
7d4c782c 3735
bde89a9e 3736 mutex_lock(&connection->data.mutex);
0500813f 3737 mutex_lock(&connection->resource->conf_update);
bde89a9e 3738 old_net_conf = connection->net_conf;
7d4c782c
AG
3739 *new_net_conf = *old_net_conf;
3740
3741 new_net_conf->wire_protocol = p_proto;
3742 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3743 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3744 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3745 new_net_conf->two_primaries = p_two_primaries;
3746
bde89a9e 3747 rcu_assign_pointer(connection->net_conf, new_net_conf);
0500813f 3748 mutex_unlock(&connection->resource->conf_update);
bde89a9e 3749 mutex_unlock(&connection->data.mutex);
7d4c782c 3750
3d0e6375 3751 crypto_free_shash(connection->peer_integrity_tfm);
bde89a9e
AG
3752 kfree(connection->int_dig_in);
3753 kfree(connection->int_dig_vv);
3754 connection->peer_integrity_tfm = peer_integrity_tfm;
3755 connection->int_dig_in = int_dig_in;
3756 connection->int_dig_vv = int_dig_vv;
7d4c782c
AG
3757
3758 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
1ec861eb 3759 drbd_info(connection, "peer data-integrity-alg: %s\n",
7d4c782c
AG
3760 integrity_alg[0] ? integrity_alg : "(none)");
3761
90c6c291 3762 kvfree_rcu(old_net_conf);
82bc0194 3763 return 0;
b411b363 3764
44ed167d
PR
3765disconnect_rcu_unlock:
3766 rcu_read_unlock();
b411b363 3767disconnect:
3d0e6375 3768 crypto_free_shash(peer_integrity_tfm);
036b17ea
PR
3769 kfree(int_dig_in);
3770 kfree(int_dig_vv);
bde89a9e 3771 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3772 return -EIO;
b411b363
PR
3773}
3774
3775/* helper function
3776 * input: alg name, feature name
3777 * return: NULL (alg name was "")
3778 * ERR_PTR(error) if something goes wrong
3779 * or the crypto hash ptr, if it worked out ok. */
3d0e6375
KC
3780static struct crypto_shash *drbd_crypto_alloc_digest_safe(
3781 const struct drbd_device *device,
b411b363
PR
3782 const char *alg, const char *name)
3783{
3d0e6375 3784 struct crypto_shash *tfm;
b411b363
PR
3785
3786 if (!alg[0])
3787 return NULL;
3788
3d0e6375 3789 tfm = crypto_alloc_shash(alg, 0, 0);
b411b363 3790 if (IS_ERR(tfm)) {
d0180171 3791 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
b411b363
PR
3792 alg, name, PTR_ERR(tfm));
3793 return tfm;
3794 }
b411b363
PR
3795 return tfm;
3796}
3797
bde89a9e 3798static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3799{
bde89a9e 3800 void *buffer = connection->data.rbuf;
4a76b161
AG
3801 int size = pi->size;
3802
3803 while (size) {
3804 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
bde89a9e 3805 s = drbd_recv(connection, buffer, s);
4a76b161
AG
3806 if (s <= 0) {
3807 if (s < 0)
3808 return s;
3809 break;
3810 }
3811 size -= s;
3812 }
3813 if (size)
3814 return -EIO;
3815 return 0;
3816}
3817
3818/*
3819 * config_unknown_volume - device configuration command for unknown volume
3820 *
3821 * When a device is added to an existing connection, the node on which the
3822 * device is added first will send configuration commands to its peer but the
3823 * peer will not know about the device yet. It will warn and ignore these
3824 * commands. Once the device is added on the second node, the second node will
3825 * send the same device configuration commands, but in the other direction.
3826 *
3827 * (We can also end up here if drbd is misconfigured.)
3828 */
bde89a9e 3829static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3830{
1ec861eb 3831 drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
2fcb8f30 3832 cmdname(pi->cmd), pi->vnr);
bde89a9e 3833 return ignore_remaining_packet(connection, pi);
4a76b161
AG
3834}
3835
bde89a9e 3836static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3837{
9f4fe9ad 3838 struct drbd_peer_device *peer_device;
b30ab791 3839 struct drbd_device *device;
e658983a 3840 struct p_rs_param_95 *p;
b411b363 3841 unsigned int header_size, data_size, exp_max_sz;
3d0e6375
KC
3842 struct crypto_shash *verify_tfm = NULL;
3843 struct crypto_shash *csums_tfm = NULL;
2ec91e0e 3844 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3845 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
bde89a9e 3846 const int apv = connection->agreed_pro_version;
813472ce 3847 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
6a365874 3848 unsigned int fifo_size = 0;
82bc0194 3849 int err;
b411b363 3850
9f4fe9ad
AG
3851 peer_device = conn_peer_device(connection, pi->vnr);
3852 if (!peer_device)
bde89a9e 3853 return config_unknown_volume(connection, pi);
9f4fe9ad 3854 device = peer_device->device;
b411b363
PR
3855
3856 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3857 : apv == 88 ? sizeof(struct p_rs_param)
3858 + SHARED_SECRET_MAX
8e26f9cc
PR
3859 : apv <= 94 ? sizeof(struct p_rs_param_89)
3860 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3861
e2857216 3862 if (pi->size > exp_max_sz) {
d0180171 3863 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3864 pi->size, exp_max_sz);
82bc0194 3865 return -EIO;
b411b363
PR
3866 }
3867
3868 if (apv <= 88) {
e658983a 3869 header_size = sizeof(struct p_rs_param);
e2857216 3870 data_size = pi->size - header_size;
8e26f9cc 3871 } else if (apv <= 94) {
e658983a 3872 header_size = sizeof(struct p_rs_param_89);
e2857216 3873 data_size = pi->size - header_size;
0b0ba1ef 3874 D_ASSERT(device, data_size == 0);
8e26f9cc 3875 } else {
e658983a 3876 header_size = sizeof(struct p_rs_param_95);
e2857216 3877 data_size = pi->size - header_size;
0b0ba1ef 3878 D_ASSERT(device, data_size == 0);
b411b363
PR
3879 }
3880
3881 /* initialize verify_alg and csums_alg */
e658983a 3882 p = pi->data;
52a0cab3
KC
3883 BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
3884 memset(&p->algs, 0, sizeof(p->algs));
b411b363 3885
9f4fe9ad 3886 err = drbd_recv_all(peer_device->connection, p, header_size);
82bc0194
AG
3887 if (err)
3888 return err;
b411b363 3889
0500813f 3890 mutex_lock(&connection->resource->conf_update);
9f4fe9ad 3891 old_net_conf = peer_device->connection->net_conf;
b30ab791 3892 if (get_ldev(device)) {
813472ce
PR
3893 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3894 if (!new_disk_conf) {
b30ab791 3895 put_ldev(device);
0500813f 3896 mutex_unlock(&connection->resource->conf_update);
d0180171 3897 drbd_err(device, "Allocation of new disk_conf failed\n");
813472ce
PR
3898 return -ENOMEM;
3899 }
daeda1cc 3900
b30ab791 3901 old_disk_conf = device->ldev->disk_conf;
813472ce 3902 *new_disk_conf = *old_disk_conf;
b411b363 3903
6394b935 3904 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3905 }
b411b363
PR
3906
3907 if (apv >= 88) {
3908 if (apv == 88) {
5de73827 3909 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
d0180171 3910 drbd_err(device, "verify-alg of wrong size, "
5de73827
PR
3911 "peer wants %u, accepting only up to %u byte\n",
3912 data_size, SHARED_SECRET_MAX);
813472ce 3913 goto reconnect;
b411b363
PR
3914 }
3915
9f4fe9ad 3916 err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
813472ce
PR
3917 if (err)
3918 goto reconnect;
b411b363
PR
3919 /* we expect NUL terminated string */
3920 /* but just in case someone tries to be evil */
0b0ba1ef 3921 D_ASSERT(device, p->verify_alg[data_size-1] == 0);
b411b363
PR
3922 p->verify_alg[data_size-1] = 0;
3923
3924 } else /* apv >= 89 */ {
3925 /* we still expect NUL terminated strings */
3926 /* but just in case someone tries to be evil */
0b0ba1ef
AG
3927 D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3928 D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
b411b363
PR
3929 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3930 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3931 }
3932
2ec91e0e 3933 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b30ab791 3934 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 3935 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3936 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3937 goto disconnect;
3938 }
b30ab791 3939 verify_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
3940 p->verify_alg, "verify-alg");
3941 if (IS_ERR(verify_tfm)) {
3942 verify_tfm = NULL;
3943 goto disconnect;
3944 }
3945 }
3946
2ec91e0e 3947 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b30ab791 3948 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 3949 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3950 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
3951 goto disconnect;
3952 }
b30ab791 3953 csums_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
3954 p->csums_alg, "csums-alg");
3955 if (IS_ERR(csums_tfm)) {
3956 csums_tfm = NULL;
3957 goto disconnect;
3958 }
3959 }
3960
813472ce 3961 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
3962 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3963 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3964 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3965 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3966
daeda1cc 3967 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
b30ab791 3968 if (fifo_size != device->rs_plan_s->size) {
813472ce
PR
3969 new_plan = fifo_alloc(fifo_size);
3970 if (!new_plan) {
d0180171 3971 drbd_err(device, "kmalloc of fifo_buffer failed");
b30ab791 3972 put_ldev(device);
778f271d
PR
3973 goto disconnect;
3974 }
3975 }
8e26f9cc 3976 }
b411b363 3977
91fd4dad 3978 if (verify_tfm || csums_tfm) {
2ec91e0e 3979 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
8404e191 3980 if (!new_net_conf)
91fd4dad 3981 goto disconnect;
91fd4dad 3982
2ec91e0e 3983 *new_net_conf = *old_net_conf;
91fd4dad
PR
3984
3985 if (verify_tfm) {
2ec91e0e
PR
3986 strcpy(new_net_conf->verify_alg, p->verify_alg);
3987 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3d0e6375 3988 crypto_free_shash(peer_device->connection->verify_tfm);
9f4fe9ad 3989 peer_device->connection->verify_tfm = verify_tfm;
d0180171 3990 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
91fd4dad
PR
3991 }
3992 if (csums_tfm) {
2ec91e0e
PR
3993 strcpy(new_net_conf->csums_alg, p->csums_alg);
3994 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3d0e6375 3995 crypto_free_shash(peer_device->connection->csums_tfm);
9f4fe9ad 3996 peer_device->connection->csums_tfm = csums_tfm;
d0180171 3997 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
91fd4dad 3998 }
bde89a9e 3999 rcu_assign_pointer(connection->net_conf, new_net_conf);
778f271d 4000 }
b411b363
PR
4001 }
4002
813472ce 4003 if (new_disk_conf) {
b30ab791
AG
4004 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4005 put_ldev(device);
813472ce
PR
4006 }
4007
4008 if (new_plan) {
b30ab791
AG
4009 old_plan = device->rs_plan_s;
4010 rcu_assign_pointer(device->rs_plan_s, new_plan);
b411b363 4011 }
daeda1cc 4012
0500813f 4013 mutex_unlock(&connection->resource->conf_update);
daeda1cc
PR
4014 synchronize_rcu();
4015 if (new_net_conf)
4016 kfree(old_net_conf);
4017 kfree(old_disk_conf);
813472ce 4018 kfree(old_plan);
daeda1cc 4019
82bc0194 4020 return 0;
b411b363 4021
813472ce
PR
4022reconnect:
4023 if (new_disk_conf) {
b30ab791 4024 put_ldev(device);
813472ce
PR
4025 kfree(new_disk_conf);
4026 }
0500813f 4027 mutex_unlock(&connection->resource->conf_update);
813472ce
PR
4028 return -EIO;
4029
b411b363 4030disconnect:
813472ce
PR
4031 kfree(new_plan);
4032 if (new_disk_conf) {
b30ab791 4033 put_ldev(device);
813472ce
PR
4034 kfree(new_disk_conf);
4035 }
0500813f 4036 mutex_unlock(&connection->resource->conf_update);
b411b363
PR
4037 /* just for completeness: actually not needed,
4038 * as this is not reached if csums_tfm was ok. */
3d0e6375 4039 crypto_free_shash(csums_tfm);
b411b363 4040 /* but free the verify_tfm again, if csums_tfm did not work out */
3d0e6375 4041 crypto_free_shash(verify_tfm);
9f4fe9ad 4042 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4043 return -EIO;
b411b363
PR
4044}
4045
b411b363 4046/* warn if the arguments differ by more than 12.5% */
b30ab791 4047static void warn_if_differ_considerably(struct drbd_device *device,
b411b363
PR
4048 const char *s, sector_t a, sector_t b)
4049{
4050 sector_t d;
4051 if (a == 0 || b == 0)
4052 return;
4053 d = (a > b) ? (a - b) : (b - a);
4054 if (d > (a>>3) || d > (b>>3))
d0180171 4055 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
b411b363
PR
4056 (unsigned long long)a, (unsigned long long)b);
4057}
4058
bde89a9e 4059static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4060{
9f4fe9ad 4061 struct drbd_peer_device *peer_device;
b30ab791 4062 struct drbd_device *device;
e658983a 4063 struct p_sizes *p = pi->data;
9104d31a 4064 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
e96c9633 4065 enum determine_dev_size dd = DS_UNCHANGED;
6a8d68b1 4066 sector_t p_size, p_usize, p_csize, my_usize;
94c43a13 4067 sector_t new_size, cur_size;
b411b363 4068 int ldsc = 0; /* local disk size changed */
e89b591c 4069 enum dds_flags ddsf;
b411b363 4070
9f4fe9ad
AG
4071 peer_device = conn_peer_device(connection, pi->vnr);
4072 if (!peer_device)
bde89a9e 4073 return config_unknown_volume(connection, pi);
9f4fe9ad 4074 device = peer_device->device;
155bd9d1 4075 cur_size = get_capacity(device->vdisk);
4a76b161 4076
b411b363
PR
4077 p_size = be64_to_cpu(p->d_size);
4078 p_usize = be64_to_cpu(p->u_size);
6a8d68b1 4079 p_csize = be64_to_cpu(p->c_size);
b411b363 4080
b411b363
PR
4081 /* just store the peer's disk size for now.
4082 * we still need to figure out whether we accept that. */
b30ab791 4083 device->p_size = p_size;
b411b363 4084
b30ab791 4085 if (get_ldev(device)) {
daeda1cc 4086 rcu_read_lock();
b30ab791 4087 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
daeda1cc
PR
4088 rcu_read_unlock();
4089
b30ab791
AG
4090 warn_if_differ_considerably(device, "lower level device sizes",
4091 p_size, drbd_get_max_capacity(device->ldev));
4092 warn_if_differ_considerably(device, "user requested size",
daeda1cc 4093 p_usize, my_usize);
b411b363
PR
4094
4095 /* if this is the first connect, or an otherwise expected
4096 * param exchange, choose the minimum */
b30ab791 4097 if (device->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 4098 p_usize = min_not_zero(my_usize, p_usize);
b411b363 4099
ad6e8979
LE
4100 /* Never shrink a device with usable data during connect,
4101 * or "attach" on the peer.
4102 * But allow online shrinking if we are connected. */
60bac040 4103 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
60bac040 4104 if (new_size < cur_size &&
b30ab791 4105 device->state.disk >= D_OUTDATED &&
ad6e8979 4106 (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
60bac040
LE
4107 drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4108 (unsigned long long)new_size, (unsigned long long)cur_size);
9f4fe9ad 4109 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
b30ab791 4110 put_ldev(device);
82bc0194 4111 return -EIO;
b411b363 4112 }
daeda1cc
PR
4113
4114 if (my_usize != p_usize) {
4115 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
4116
4117 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
4118 if (!new_disk_conf) {
b30ab791 4119 put_ldev(device);
daeda1cc
PR
4120 return -ENOMEM;
4121 }
4122
0500813f 4123 mutex_lock(&connection->resource->conf_update);
b30ab791 4124 old_disk_conf = device->ldev->disk_conf;
daeda1cc
PR
4125 *new_disk_conf = *old_disk_conf;
4126 new_disk_conf->disk_size = p_usize;
4127
b30ab791 4128 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
0500813f 4129 mutex_unlock(&connection->resource->conf_update);
90c6c291 4130 kvfree_rcu(old_disk_conf);
daeda1cc 4131
ad6e8979
LE
4132 drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
4133 (unsigned long)p_usize, (unsigned long)my_usize);
b411b363 4134 }
daeda1cc 4135
b30ab791 4136 put_ldev(device);
b411b363 4137 }
b411b363 4138
20c68fde 4139 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
dd4f699d 4140 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
20c68fde 4141 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
dd4f699d 4142 drbd_reconsider_queue_parameters(), we can be sure that after
20c68fde
LE
4143 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4144
e89b591c 4145 ddsf = be16_to_cpu(p->dds_flags);
b30ab791 4146 if (get_ldev(device)) {
9104d31a 4147 drbd_reconsider_queue_parameters(device, device->ldev, o);
b30ab791
AG
4148 dd = drbd_determine_dev_size(device, ddsf, NULL);
4149 put_ldev(device);
e96c9633 4150 if (dd == DS_ERROR)
82bc0194 4151 return -EIO;
b30ab791 4152 drbd_md_sync(device);
b411b363 4153 } else {
6a8d68b1
LE
4154 /*
4155 * I am diskless, need to accept the peer's *current* size.
4156 * I must NOT accept the peers backing disk size,
4157 * it may have been larger than mine all along...
4158 *
4159 * At this point, the peer knows more about my disk, or at
4160 * least about what we last agreed upon, than myself.
4161 * So if his c_size is less than his d_size, the most likely
4162 * reason is that *my* d_size was smaller last time we checked.
4163 *
4164 * However, if he sends a zero current size,
4165 * take his (user-capped or) backing disk size anyways.
94c43a13
LE
4166 *
4167 * Unless of course he does not have a disk himself.
4168 * In which case we ignore this completely.
6a8d68b1 4169 */
94c43a13 4170 sector_t new_size = p_csize ?: p_usize ?: p_size;
9104d31a 4171 drbd_reconsider_queue_parameters(device, NULL, o);
94c43a13
LE
4172 if (new_size == 0) {
4173 /* Ignore, peer does not know nothing. */
4174 } else if (new_size == cur_size) {
4175 /* nothing to do */
4176 } else if (cur_size != 0 && p_size == 0) {
4177 drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
4178 (unsigned long long)new_size, (unsigned long long)cur_size);
4179 } else if (new_size < cur_size && device->state.role == R_PRIMARY) {
4180 drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
4181 (unsigned long long)new_size, (unsigned long long)cur_size);
4182 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4183 return -EIO;
4184 } else {
4185 /* I believe the peer, if
4186 * - I don't have a current size myself
4187 * - we agree on the size anyways
4188 * - I do have a current size, am Secondary,
4189 * and he has the only disk
4190 * - I do have a current size, am Primary,
4191 * and he has the only disk,
4192 * which is larger than my current size
4193 */
4194 drbd_set_my_capacity(device, new_size);
4195 }
b411b363
PR
4196 }
4197
b30ab791
AG
4198 if (get_ldev(device)) {
4199 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4200 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
b411b363
PR
4201 ldsc = 1;
4202 }
4203
b30ab791 4204 put_ldev(device);
b411b363
PR
4205 }
4206
b30ab791 4207 if (device->state.conn > C_WF_REPORT_PARAMS) {
155bd9d1
CH
4208 if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
4209 ldsc) {
b411b363
PR
4210 /* we have different sizes, probably peer
4211 * needs to know my new size... */
69a22773 4212 drbd_send_sizes(peer_device, 0, ddsf);
b411b363 4213 }
b30ab791
AG
4214 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4215 (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4216 if (device->state.pdsk >= D_INCONSISTENT &&
4217 device->state.disk >= D_INCONSISTENT) {
e89b591c 4218 if (ddsf & DDSF_NO_RESYNC)
d0180171 4219 drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
e89b591c 4220 else
b30ab791 4221 resync_after_online_grow(device);
e89b591c 4222 } else
b30ab791 4223 set_bit(RESYNC_AFTER_NEG, &device->flags);
b411b363
PR
4224 }
4225 }
4226
82bc0194 4227 return 0;
b411b363
PR
4228}
4229
bde89a9e 4230static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4231{
9f4fe9ad 4232 struct drbd_peer_device *peer_device;
b30ab791 4233 struct drbd_device *device;
e658983a 4234 struct p_uuids *p = pi->data;
b411b363 4235 u64 *p_uuid;
62b0da3a 4236 int i, updated_uuids = 0;
b411b363 4237
9f4fe9ad
AG
4238 peer_device = conn_peer_device(connection, pi->vnr);
4239 if (!peer_device)
bde89a9e 4240 return config_unknown_volume(connection, pi);
9f4fe9ad 4241 device = peer_device->device;
4a76b161 4242
365cf663 4243 p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
8404e191 4244 if (!p_uuid)
063eacf8 4245 return false;
b411b363
PR
4246
4247 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4248 p_uuid[i] = be64_to_cpu(p->uuid[i]);
4249
b30ab791
AG
4250 kfree(device->p_uuid);
4251 device->p_uuid = p_uuid;
b411b363 4252
b17b5960 4253 if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
b30ab791
AG
4254 device->state.disk < D_INCONSISTENT &&
4255 device->state.role == R_PRIMARY &&
4256 (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
d0180171 4257 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
b30ab791 4258 (unsigned long long)device->ed_uuid);
9f4fe9ad 4259 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4260 return -EIO;
b411b363
PR
4261 }
4262
b30ab791 4263 if (get_ldev(device)) {
b411b363 4264 int skip_initial_sync =
b30ab791 4265 device->state.conn == C_CONNECTED &&
9f4fe9ad 4266 peer_device->connection->agreed_pro_version >= 90 &&
b30ab791 4267 device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
b411b363
PR
4268 (p_uuid[UI_FLAGS] & 8);
4269 if (skip_initial_sync) {
d0180171 4270 drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
b30ab791 4271 drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
20ceb2b2
LE
4272 "clear_n_write from receive_uuids",
4273 BM_LOCKED_TEST_ALLOWED);
b30ab791
AG
4274 _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4275 _drbd_uuid_set(device, UI_BITMAP, 0);
4276 _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
b411b363 4277 CS_VERBOSE, NULL);
b30ab791 4278 drbd_md_sync(device);
62b0da3a 4279 updated_uuids = 1;
b411b363 4280 }
b30ab791
AG
4281 put_ldev(device);
4282 } else if (device->state.disk < D_INCONSISTENT &&
4283 device->state.role == R_PRIMARY) {
18a50fa2
PR
4284 /* I am a diskless primary, the peer just created a new current UUID
4285 for me. */
b30ab791 4286 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
b411b363
PR
4287 }
4288
4289 /* Before we test for the disk state, we should wait until an eventually
4290 ongoing cluster wide state change is finished. That is important if
4291 we are primary and are detaching from our disk. We need to see the
4292 new disk state... */
b30ab791
AG
4293 mutex_lock(device->state_mutex);
4294 mutex_unlock(device->state_mutex);
4295 if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4296 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
62b0da3a
LE
4297
4298 if (updated_uuids)
b30ab791 4299 drbd_print_uuids(device, "receiver updated UUIDs to");
b411b363 4300
82bc0194 4301 return 0;
b411b363
PR
4302}
4303
4304/**
4305 * convert_state() - Converts the peer's view of the cluster state to our point of view
4306 * @ps: The state as seen by the peer.
4307 */
4308static union drbd_state convert_state(union drbd_state ps)
4309{
4310 union drbd_state ms;
4311
4312 static enum drbd_conns c_tab[] = {
369bea63 4313 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
4314 [C_CONNECTED] = C_CONNECTED,
4315
4316 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4317 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4318 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4319 [C_VERIFY_S] = C_VERIFY_T,
4320 [C_MASK] = C_MASK,
4321 };
4322
4323 ms.i = ps.i;
4324
4325 ms.conn = c_tab[ps.conn];
4326 ms.peer = ps.role;
4327 ms.role = ps.peer;
4328 ms.pdsk = ps.disk;
4329 ms.disk = ps.pdsk;
4330 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4331
4332 return ms;
4333}
4334
bde89a9e 4335static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4336{
9f4fe9ad 4337 struct drbd_peer_device *peer_device;
b30ab791 4338 struct drbd_device *device;
e658983a 4339 struct p_req_state *p = pi->data;
b411b363 4340 union drbd_state mask, val;
bf885f8a 4341 enum drbd_state_rv rv;
b411b363 4342
9f4fe9ad
AG
4343 peer_device = conn_peer_device(connection, pi->vnr);
4344 if (!peer_device)
4a76b161 4345 return -EIO;
9f4fe9ad 4346 device = peer_device->device;
4a76b161 4347
b411b363
PR
4348 mask.i = be32_to_cpu(p->mask);
4349 val.i = be32_to_cpu(p->val);
4350
9f4fe9ad 4351 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
b30ab791 4352 mutex_is_locked(device->state_mutex)) {
69a22773 4353 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
82bc0194 4354 return 0;
b411b363
PR
4355 }
4356
4357 mask = convert_state(mask);
4358 val = convert_state(val);
4359
b30ab791 4360 rv = drbd_change_state(device, CS_VERBOSE, mask, val);
69a22773 4361 drbd_send_sr_reply(peer_device, rv);
b411b363 4362
b30ab791 4363 drbd_md_sync(device);
b411b363 4364
82bc0194 4365 return 0;
b411b363
PR
4366}
4367
bde89a9e 4368static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4369{
e658983a 4370 struct p_req_state *p = pi->data;
b411b363 4371 union drbd_state mask, val;
bf885f8a 4372 enum drbd_state_rv rv;
b411b363 4373
b411b363
PR
4374 mask.i = be32_to_cpu(p->mask);
4375 val.i = be32_to_cpu(p->val);
4376
bde89a9e
AG
4377 if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4378 mutex_is_locked(&connection->cstate_mutex)) {
4379 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
82bc0194 4380 return 0;
b411b363
PR
4381 }
4382
4383 mask = convert_state(mask);
4384 val = convert_state(val);
4385
bde89a9e
AG
4386 rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4387 conn_send_sr_reply(connection, rv);
b411b363 4388
82bc0194 4389 return 0;
b411b363
PR
4390}
4391
bde89a9e 4392static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4393{
9f4fe9ad 4394 struct drbd_peer_device *peer_device;
b30ab791 4395 struct drbd_device *device;
e658983a 4396 struct p_state *p = pi->data;
4ac4aada 4397 union drbd_state os, ns, peer_state;
b411b363 4398 enum drbd_disk_state real_peer_disk;
65d922c3 4399 enum chg_state_flags cs_flags;
b411b363
PR
4400 int rv;
4401
9f4fe9ad
AG
4402 peer_device = conn_peer_device(connection, pi->vnr);
4403 if (!peer_device)
bde89a9e 4404 return config_unknown_volume(connection, pi);
9f4fe9ad 4405 device = peer_device->device;
4a76b161 4406
b411b363
PR
4407 peer_state.i = be32_to_cpu(p->state);
4408
4409 real_peer_disk = peer_state.disk;
4410 if (peer_state.disk == D_NEGOTIATING) {
b30ab791 4411 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
d0180171 4412 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
b411b363
PR
4413 }
4414
0500813f 4415 spin_lock_irq(&device->resource->req_lock);
b411b363 4416 retry:
b30ab791 4417 os = ns = drbd_read_state(device);
0500813f 4418 spin_unlock_irq(&device->resource->req_lock);
b411b363 4419
668700b4 4420 /* If some other part of the code (ack_receiver thread, timeout)
545752d5
LE
4421 * already decided to close the connection again,
4422 * we must not "re-establish" it here. */
4423 if (os.conn <= C_TEAR_DOWN)
58ffa580 4424 return -ECONNRESET;
545752d5 4425
40424e4a
LE
4426 /* If this is the "end of sync" confirmation, usually the peer disk
4427 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4428 * set) resync started in PausedSyncT, or if the timing of pause-/
4429 * unpause-sync events has been "just right", the peer disk may
4430 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4431 */
4432 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4433 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
4434 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4435 /* If we are (becoming) SyncSource, but peer is still in sync
4436 * preparation, ignore its uptodate-ness to avoid flapping, it
4437 * will change to inconsistent once the peer reaches active
4438 * syncing states.
4439 * It may have changed syncer-paused flags, however, so we
4440 * cannot ignore this completely. */
4441 if (peer_state.conn > C_CONNECTED &&
4442 peer_state.conn < C_SYNC_SOURCE)
4443 real_peer_disk = D_INCONSISTENT;
4444
4445 /* if peer_state changes to connected at the same time,
4446 * it explicitly notifies us that it finished resync.
4447 * Maybe we should finish it up, too? */
4448 else if (os.conn >= C_SYNC_SOURCE &&
4449 peer_state.conn == C_CONNECTED) {
b30ab791
AG
4450 if (drbd_bm_total_weight(device) <= device->rs_failed)
4451 drbd_resync_finished(device);
82bc0194 4452 return 0;
e9ef7bb6
LE
4453 }
4454 }
4455
02b91b55
LE
4456 /* explicit verify finished notification, stop sector reached. */
4457 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4458 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
b30ab791
AG
4459 ov_out_of_sync_print(device);
4460 drbd_resync_finished(device);
58ffa580 4461 return 0;
02b91b55
LE
4462 }
4463
e9ef7bb6
LE
4464 /* peer says his disk is inconsistent, while we think it is uptodate,
4465 * and this happens while the peer still thinks we have a sync going on,
4466 * but we think we are already done with the sync.
4467 * We ignore this to avoid flapping pdsk.
4468 * This should not happen, if the peer is a recent version of drbd. */
4469 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4470 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4471 real_peer_disk = D_UP_TO_DATE;
4472
4ac4aada
LE
4473 if (ns.conn == C_WF_REPORT_PARAMS)
4474 ns.conn = C_CONNECTED;
b411b363 4475
67531718
PR
4476 if (peer_state.conn == C_AHEAD)
4477 ns.conn = C_BEHIND;
4478
fe43ed97
LE
4479 /* TODO:
4480 * if (primary and diskless and peer uuid != effective uuid)
4481 * abort attach on peer;
4482 *
4483 * If this node does not have good data, was already connected, but
4484 * the peer did a late attach only now, trying to "negotiate" with me,
4485 * AND I am currently Primary, possibly frozen, with some specific
4486 * "effective" uuid, this should never be reached, really, because
4487 * we first send the uuids, then the current state.
4488 *
4489 * In this scenario, we already dropped the connection hard
4490 * when we received the unsuitable uuids (receive_uuids().
4491 *
4492 * Should we want to change this, that is: not drop the connection in
4493 * receive_uuids() already, then we would need to add a branch here
4494 * that aborts the attach of "unsuitable uuids" on the peer in case
4495 * this node is currently Diskless Primary.
4496 */
4497
b30ab791
AG
4498 if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4499 get_ldev_if_state(device, D_NEGOTIATING)) {
b411b363
PR
4500 int cr; /* consider resync */
4501
4502 /* if we established a new connection */
4ac4aada 4503 cr = (os.conn < C_CONNECTED);
b411b363
PR
4504 /* if we had an established connection
4505 * and one of the nodes newly attaches a disk */
4ac4aada 4506 cr |= (os.conn == C_CONNECTED &&
b411b363 4507 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 4508 os.disk == D_NEGOTIATING));
b411b363 4509 /* if we have both been inconsistent, and the peer has been
a2823ea9 4510 * forced to be UpToDate with --force */
b30ab791 4511 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4512 /* if we had been plain connected, and the admin requested to
4513 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 4514 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
4515 (peer_state.conn >= C_STARTING_SYNC_S &&
4516 peer_state.conn <= C_WF_BITMAP_T));
4517
4518 if (cr)
69a22773 4519 ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
b411b363 4520
b30ab791 4521 put_ldev(device);
4ac4aada
LE
4522 if (ns.conn == C_MASK) {
4523 ns.conn = C_CONNECTED;
b30ab791
AG
4524 if (device->state.disk == D_NEGOTIATING) {
4525 drbd_force_state(device, NS(disk, D_FAILED));
b411b363 4526 } else if (peer_state.disk == D_NEGOTIATING) {
d0180171 4527 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
b411b363 4528 peer_state.disk = D_DISKLESS;
580b9767 4529 real_peer_disk = D_DISKLESS;
b411b363 4530 } else {
9f4fe9ad 4531 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
82bc0194 4532 return -EIO;
0b0ba1ef 4533 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
9f4fe9ad 4534 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4535 return -EIO;
b411b363
PR
4536 }
4537 }
4538 }
4539
0500813f 4540 spin_lock_irq(&device->resource->req_lock);
b30ab791 4541 if (os.i != drbd_read_state(device).i)
b411b363 4542 goto retry;
b30ab791 4543 clear_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4544 ns.peer = peer_state.role;
4545 ns.pdsk = real_peer_disk;
4546 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 4547 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b30ab791 4548 ns.disk = device->new_state_tmp.disk;
4ac4aada 4549 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
b30ab791
AG
4550 if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4551 test_bit(NEW_CUR_UUID, &device->flags)) {
8554df1c 4552 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 4553 for temporal network outages! */
0500813f 4554 spin_unlock_irq(&device->resource->req_lock);
d0180171 4555 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
9f4fe9ad 4556 tl_clear(peer_device->connection);
b30ab791
AG
4557 drbd_uuid_new_current(device);
4558 clear_bit(NEW_CUR_UUID, &device->flags);
9f4fe9ad 4559 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 4560 return -EIO;
481c6f50 4561 }
b30ab791
AG
4562 rv = _drbd_set_state(device, ns, cs_flags, NULL);
4563 ns = drbd_read_state(device);
0500813f 4564 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
4565
4566 if (rv < SS_SUCCESS) {
9f4fe9ad 4567 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4568 return -EIO;
b411b363
PR
4569 }
4570
4ac4aada
LE
4571 if (os.conn > C_WF_REPORT_PARAMS) {
4572 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
4573 peer_state.disk != D_NEGOTIATING ) {
4574 /* we want resync, peer has not yet decided to sync... */
4575 /* Nowadays only used when forcing a node into primary role and
4576 setting its disk to UpToDate with that */
69a22773
AG
4577 drbd_send_uuids(peer_device);
4578 drbd_send_current_state(peer_device);
b411b363
PR
4579 }
4580 }
4581
b30ab791 4582 clear_bit(DISCARD_MY_DATA, &device->flags);
b411b363 4583
b30ab791 4584 drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
b411b363 4585
82bc0194 4586 return 0;
b411b363
PR
4587}
4588
bde89a9e 4589static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4590{
9f4fe9ad 4591 struct drbd_peer_device *peer_device;
b30ab791 4592 struct drbd_device *device;
e658983a 4593 struct p_rs_uuid *p = pi->data;
4a76b161 4594
9f4fe9ad
AG
4595 peer_device = conn_peer_device(connection, pi->vnr);
4596 if (!peer_device)
4a76b161 4597 return -EIO;
9f4fe9ad 4598 device = peer_device->device;
b411b363 4599
b30ab791
AG
4600 wait_event(device->misc_wait,
4601 device->state.conn == C_WF_SYNC_UUID ||
4602 device->state.conn == C_BEHIND ||
4603 device->state.conn < C_CONNECTED ||
4604 device->state.disk < D_NEGOTIATING);
b411b363 4605
0b0ba1ef 4606 /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
b411b363 4607
b411b363
PR
4608 /* Here the _drbd_uuid_ functions are right, current should
4609 _not_ be rotated into the history */
b30ab791
AG
4610 if (get_ldev_if_state(device, D_NEGOTIATING)) {
4611 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4612 _drbd_uuid_set(device, UI_BITMAP, 0UL);
b411b363 4613
b30ab791
AG
4614 drbd_print_uuids(device, "updated sync uuid");
4615 drbd_start_resync(device, C_SYNC_TARGET);
b411b363 4616
b30ab791 4617 put_ldev(device);
b411b363 4618 } else
d0180171 4619 drbd_err(device, "Ignoring SyncUUID packet!\n");
b411b363 4620
82bc0194 4621 return 0;
b411b363
PR
4622}
4623
9b48ff07 4624/*
2c46407d
AG
4625 * receive_bitmap_plain
4626 *
4627 * Return 0 when done, 1 when another iteration is needed, and a negative error
4628 * code upon failure.
4629 */
4630static int
69a22773 4631receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
e658983a 4632 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4633{
50d0b1ad 4634 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
69a22773 4635 drbd_header_size(peer_device->connection);
e658983a 4636 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4637 c->bm_words - c->word_offset);
e658983a 4638 unsigned int want = num_words * sizeof(*p);
2c46407d 4639 int err;
b411b363 4640
50d0b1ad 4641 if (want != size) {
69a22773 4642 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4643 return -EIO;
b411b363
PR
4644 }
4645 if (want == 0)
2c46407d 4646 return 0;
69a22773 4647 err = drbd_recv_all(peer_device->connection, p, want);
82bc0194 4648 if (err)
2c46407d 4649 return err;
b411b363 4650
69a22773 4651 drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
b411b363
PR
4652
4653 c->word_offset += num_words;
4654 c->bit_offset = c->word_offset * BITS_PER_LONG;
4655 if (c->bit_offset > c->bm_bits)
4656 c->bit_offset = c->bm_bits;
4657
2c46407d 4658 return 1;
b411b363
PR
4659}
4660
a02d1240
AG
4661static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4662{
4663 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4664}
4665
4666static int dcbp_get_start(struct p_compressed_bm *p)
4667{
4668 return (p->encoding & 0x80) != 0;
4669}
4670
4671static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4672{
4673 return (p->encoding >> 4) & 0x7;
4674}
4675
9b48ff07 4676/*
2c46407d
AG
4677 * recv_bm_rle_bits
4678 *
4679 * Return 0 when done, 1 when another iteration is needed, and a negative error
4680 * code upon failure.
4681 */
4682static int
69a22773 4683recv_bm_rle_bits(struct drbd_peer_device *peer_device,
b411b363 4684 struct p_compressed_bm *p,
c6d25cfe
PR
4685 struct bm_xfer_ctx *c,
4686 unsigned int len)
b411b363
PR
4687{
4688 struct bitstream bs;
4689 u64 look_ahead;
4690 u64 rl;
4691 u64 tmp;
4692 unsigned long s = c->bit_offset;
4693 unsigned long e;
a02d1240 4694 int toggle = dcbp_get_start(p);
b411b363
PR
4695 int have;
4696 int bits;
4697
a02d1240 4698 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4699
4700 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4701 if (bits < 0)
2c46407d 4702 return -EIO;
b411b363
PR
4703
4704 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4705 bits = vli_decode_bits(&rl, look_ahead);
4706 if (bits <= 0)
2c46407d 4707 return -EIO;
b411b363
PR
4708
4709 if (toggle) {
4710 e = s + rl -1;
4711 if (e >= c->bm_bits) {
69a22773 4712 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4713 return -EIO;
b411b363 4714 }
69a22773 4715 _drbd_bm_set_bits(peer_device->device, s, e);
b411b363
PR
4716 }
4717
4718 if (have < bits) {
69a22773 4719 drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
b411b363
PR
4720 have, bits, look_ahead,
4721 (unsigned int)(bs.cur.b - p->code),
4722 (unsigned int)bs.buf_len);
2c46407d 4723 return -EIO;
b411b363 4724 }
d2da5b0c
LE
4725 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4726 if (likely(bits < 64))
4727 look_ahead >>= bits;
4728 else
4729 look_ahead = 0;
b411b363
PR
4730 have -= bits;
4731
4732 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4733 if (bits < 0)
2c46407d 4734 return -EIO;
b411b363
PR
4735 look_ahead |= tmp << have;
4736 have += bits;
4737 }
4738
4739 c->bit_offset = s;
4740 bm_xfer_ctx_bit_to_word_offset(c);
4741
2c46407d 4742 return (s != c->bm_bits);
b411b363
PR
4743}
4744
9b48ff07 4745/*
2c46407d
AG
4746 * decode_bitmap_c
4747 *
4748 * Return 0 when done, 1 when another iteration is needed, and a negative error
4749 * code upon failure.
4750 */
4751static int
69a22773 4752decode_bitmap_c(struct drbd_peer_device *peer_device,
b411b363 4753 struct p_compressed_bm *p,
c6d25cfe
PR
4754 struct bm_xfer_ctx *c,
4755 unsigned int len)
b411b363 4756{
a02d1240 4757 if (dcbp_get_code(p) == RLE_VLI_Bits)
69a22773 4758 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
b411b363
PR
4759
4760 /* other variants had been implemented for evaluation,
4761 * but have been dropped as this one turned out to be "best"
4762 * during all our tests. */
4763
69a22773
AG
4764 drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4765 conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4766 return -EIO;
b411b363
PR
4767}
4768
b30ab791 4769void INFO_bm_xfer_stats(struct drbd_device *device,
b411b363
PR
4770 const char *direction, struct bm_xfer_ctx *c)
4771{
4772 /* what would it take to transfer it "plaintext" */
a6b32bc3 4773 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
50d0b1ad
AG
4774 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4775 unsigned int plain =
4776 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4777 c->bm_words * sizeof(unsigned long);
4778 unsigned int total = c->bytes[0] + c->bytes[1];
4779 unsigned int r;
b411b363
PR
4780
4781 /* total can not be zero. but just in case: */
4782 if (total == 0)
4783 return;
4784
4785 /* don't report if not compressed */
4786 if (total >= plain)
4787 return;
4788
4789 /* total < plain. check for overflow, still */
4790 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4791 : (1000 * total / plain);
4792
4793 if (r > 1000)
4794 r = 1000;
4795
4796 r = 1000 - r;
d0180171 4797 drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
b411b363
PR
4798 "total %u; compression: %u.%u%%\n",
4799 direction,
4800 c->bytes[1], c->packets[1],
4801 c->bytes[0], c->packets[0],
4802 total, r/10, r % 10);
4803}
4804
4805/* Since we are processing the bitfield from lower addresses to higher,
4806 it does not matter if the process it in 32 bit chunks or 64 bit
4807 chunks as long as it is little endian. (Understand it as byte stream,
4808 beginning with the lowest byte...) If we would use big endian
4809 we would need to process it from the highest address to the lowest,
4810 in order to be agnostic to the 32 vs 64 bits issue.
4811
4812 returns 0 on failure, 1 if we successfully received it. */
bde89a9e 4813static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4814{
9f4fe9ad 4815 struct drbd_peer_device *peer_device;
b30ab791 4816 struct drbd_device *device;
b411b363 4817 struct bm_xfer_ctx c;
2c46407d 4818 int err;
4a76b161 4819
9f4fe9ad
AG
4820 peer_device = conn_peer_device(connection, pi->vnr);
4821 if (!peer_device)
4a76b161 4822 return -EIO;
9f4fe9ad 4823 device = peer_device->device;
b411b363 4824
b30ab791 4825 drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
20ceb2b2
LE
4826 /* you are supposed to send additional out-of-sync information
4827 * if you actually set bits during this phase */
b411b363 4828
b411b363 4829 c = (struct bm_xfer_ctx) {
b30ab791
AG
4830 .bm_bits = drbd_bm_bits(device),
4831 .bm_words = drbd_bm_words(device),
b411b363
PR
4832 };
4833
2c46407d 4834 for(;;) {
e658983a 4835 if (pi->cmd == P_BITMAP)
69a22773 4836 err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
e658983a 4837 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4838 /* MAYBE: sanity check that we speak proto >= 90,
4839 * and the feature is enabled! */
e658983a 4840 struct p_compressed_bm *p = pi->data;
b411b363 4841
bde89a9e 4842 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
d0180171 4843 drbd_err(device, "ReportCBitmap packet too large\n");
82bc0194 4844 err = -EIO;
b411b363
PR
4845 goto out;
4846 }
e658983a 4847 if (pi->size <= sizeof(*p)) {
d0180171 4848 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4849 err = -EIO;
78fcbdae 4850 goto out;
b411b363 4851 }
9f4fe9ad 4852 err = drbd_recv_all(peer_device->connection, p, pi->size);
e658983a
AG
4853 if (err)
4854 goto out;
69a22773 4855 err = decode_bitmap_c(peer_device, p, &c, pi->size);
b411b363 4856 } else {
d0180171 4857 drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 4858 err = -EIO;
b411b363
PR
4859 goto out;
4860 }
4861
e2857216 4862 c.packets[pi->cmd == P_BITMAP]++;
bde89a9e 4863 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
b411b363 4864
2c46407d
AG
4865 if (err <= 0) {
4866 if (err < 0)
4867 goto out;
b411b363 4868 break;
2c46407d 4869 }
9f4fe9ad 4870 err = drbd_recv_header(peer_device->connection, pi);
82bc0194 4871 if (err)
b411b363 4872 goto out;
2c46407d 4873 }
b411b363 4874
b30ab791 4875 INFO_bm_xfer_stats(device, "receive", &c);
b411b363 4876
b30ab791 4877 if (device->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4878 enum drbd_state_rv rv;
4879
b30ab791 4880 err = drbd_send_bitmap(device);
82bc0194 4881 if (err)
b411b363
PR
4882 goto out;
4883 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
b30ab791 4884 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
0b0ba1ef 4885 D_ASSERT(device, rv == SS_SUCCESS);
b30ab791 4886 } else if (device->state.conn != C_WF_BITMAP_S) {
b411b363
PR
4887 /* admin may have requested C_DISCONNECTING,
4888 * other threads may have noticed network errors */
d0180171 4889 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
b30ab791 4890 drbd_conn_str(device->state.conn));
b411b363 4891 }
82bc0194 4892 err = 0;
b411b363 4893
b411b363 4894 out:
b30ab791
AG
4895 drbd_bm_unlock(device);
4896 if (!err && device->state.conn == C_WF_BITMAP_S)
4897 drbd_start_resync(device, C_SYNC_SOURCE);
82bc0194 4898 return err;
b411b363
PR
4899}
4900
bde89a9e 4901static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4902{
1ec861eb 4903 drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 4904 pi->cmd, pi->size);
b411b363 4905
bde89a9e 4906 return ignore_remaining_packet(connection, pi);
b411b363
PR
4907}
4908
bde89a9e 4909static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
0ced55a3 4910{
e7f52dfb
LE
4911 /* Make sure we've acked all the TCP data associated
4912 * with the data requests being unplugged */
ddd061b8 4913 tcp_sock_set_quickack(connection->data.socket->sk, 2);
82bc0194 4914 return 0;
0ced55a3
PR
4915}
4916
bde89a9e 4917static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
73a01a18 4918{
9f4fe9ad 4919 struct drbd_peer_device *peer_device;
b30ab791 4920 struct drbd_device *device;
e658983a 4921 struct p_block_desc *p = pi->data;
4a76b161 4922
9f4fe9ad
AG
4923 peer_device = conn_peer_device(connection, pi->vnr);
4924 if (!peer_device)
4a76b161 4925 return -EIO;
9f4fe9ad 4926 device = peer_device->device;
73a01a18 4927
b30ab791 4928 switch (device->state.conn) {
f735e363
LE
4929 case C_WF_SYNC_UUID:
4930 case C_WF_BITMAP_T:
4931 case C_BEHIND:
4932 break;
4933 default:
d0180171 4934 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
b30ab791 4935 drbd_conn_str(device->state.conn));
f735e363
LE
4936 }
4937
b30ab791 4938 drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
73a01a18 4939
82bc0194 4940 return 0;
73a01a18
PR
4941}
4942
700ca8c0
PR
4943static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4944{
4945 struct drbd_peer_device *peer_device;
4946 struct p_block_desc *p = pi->data;
4947 struct drbd_device *device;
4948 sector_t sector;
4949 int size, err = 0;
4950
4951 peer_device = conn_peer_device(connection, pi->vnr);
4952 if (!peer_device)
4953 return -EIO;
4954 device = peer_device->device;
4955
4956 sector = be64_to_cpu(p->sector);
4957 size = be32_to_cpu(p->blksize);
4958
4959 dec_rs_pending(device);
4960
4961 if (get_ldev(device)) {
4962 struct drbd_peer_request *peer_req;
700ca8c0
PR
4963
4964 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
9104d31a 4965 size, 0, GFP_NOIO);
700ca8c0
PR
4966 if (!peer_req) {
4967 put_ldev(device);
4968 return -ENOMEM;
4969 }
4970
4971 peer_req->w.cb = e_end_resync_block;
ce668b6d 4972 peer_req->opf = REQ_OP_DISCARD;
700ca8c0 4973 peer_req->submit_jif = jiffies;
f31e583a 4974 peer_req->flags |= EE_TRIM;
700ca8c0
PR
4975
4976 spin_lock_irq(&device->resource->req_lock);
4977 list_add_tail(&peer_req->w.list, &device->sync_ee);
4978 spin_unlock_irq(&device->resource->req_lock);
4979
4980 atomic_add(pi->size >> 9, &device->rs_sect_ev);
ce668b6d 4981 err = drbd_submit_peer_request(peer_req);
700ca8c0
PR
4982
4983 if (err) {
4984 spin_lock_irq(&device->resource->req_lock);
4985 list_del(&peer_req->w.list);
4986 spin_unlock_irq(&device->resource->req_lock);
4987
4988 drbd_free_peer_req(device, peer_req);
4989 put_ldev(device);
4990 err = 0;
4991 goto fail;
4992 }
4993
4994 inc_unacked(device);
4995
4996 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4997 as well as drbd_rs_complete_io() */
4998 } else {
4999 fail:
5000 drbd_rs_complete_io(device, sector);
5001 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
5002 }
5003
5004 atomic_add(size >> 9, &device->rs_sect_in);
5005
5006 return err;
5007}
5008
02918be2
PR
5009struct data_cmd {
5010 int expect_payload;
9104d31a 5011 unsigned int pkt_size;
bde89a9e 5012 int (*fn)(struct drbd_connection *, struct packet_info *);
02918be2
PR
5013};
5014
5015static struct data_cmd drbd_cmd_handler[] = {
5016 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
5017 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
5018 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
5019 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
e658983a
AG
5020 [P_BITMAP] = { 1, 0, receive_bitmap } ,
5021 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
5022 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
02918be2
PR
5023 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5024 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
e658983a
AG
5025 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
5026 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
02918be2
PR
5027 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
5028 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
5029 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
5030 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
5031 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
5032 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
5033 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5034 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
5035 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
700ca8c0 5036 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
02918be2 5037 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
73a01a18 5038 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4a76b161 5039 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
036b17ea 5040 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
a0fb3c47 5041 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
f31e583a 5042 [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data },
700ca8c0 5043 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
b411b363
PR
5044};
5045
bde89a9e 5046static void drbdd(struct drbd_connection *connection)
b411b363 5047{
77351055 5048 struct packet_info pi;
02918be2 5049 size_t shs; /* sub header size */
82bc0194 5050 int err;
b411b363 5051
bde89a9e 5052 while (get_t_state(&connection->receiver) == RUNNING) {
9104d31a 5053 struct data_cmd const *cmd;
b411b363 5054
bde89a9e 5055 drbd_thread_current_set_cpu(&connection->receiver);
c51a0ef3
LE
5056 update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
5057 if (drbd_recv_header_maybe_unplug(connection, &pi))
02918be2 5058 goto err_out;
b411b363 5059
deebe195 5060 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 5061 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
1ec861eb 5062 drbd_err(connection, "Unexpected data packet %s (0x%04x)",
2fcb8f30 5063 cmdname(pi.cmd), pi.cmd);
02918be2 5064 goto err_out;
0b33a916 5065 }
b411b363 5066
e658983a 5067 shs = cmd->pkt_size;
9104d31a
LE
5068 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
5069 shs += sizeof(struct o_qlim);
e658983a 5070 if (pi.size > shs && !cmd->expect_payload) {
1ec861eb 5071 drbd_err(connection, "No payload expected %s l:%d\n",
2fcb8f30 5072 cmdname(pi.cmd), pi.size);
02918be2 5073 goto err_out;
b411b363 5074 }
9104d31a
LE
5075 if (pi.size < shs) {
5076 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5077 cmdname(pi.cmd), (int)shs, pi.size);
5078 goto err_out;
5079 }
b411b363 5080
c13f7e1a 5081 if (shs) {
944410e9 5082 update_receiver_timing_details(connection, drbd_recv_all_warn);
bde89a9e 5083 err = drbd_recv_all_warn(connection, pi.data, shs);
a5c31904 5084 if (err)
c13f7e1a 5085 goto err_out;
e2857216 5086 pi.size -= shs;
c13f7e1a
LE
5087 }
5088
944410e9 5089 update_receiver_timing_details(connection, cmd->fn);
bde89a9e 5090 err = cmd->fn(connection, &pi);
4a76b161 5091 if (err) {
1ec861eb 5092 drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
9f5bdc33 5093 cmdname(pi.cmd), err, pi.size);
02918be2 5094 goto err_out;
b411b363
PR
5095 }
5096 }
82bc0194 5097 return;
b411b363 5098
82bc0194 5099 err_out:
bde89a9e 5100 conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
5101}
5102
bde89a9e 5103static void conn_disconnect(struct drbd_connection *connection)
b411b363 5104{
c06ece6b 5105 struct drbd_peer_device *peer_device;
bbeb641c 5106 enum drbd_conns oc;
376694a0 5107 int vnr;
b411b363 5108
bde89a9e 5109 if (connection->cstate == C_STANDALONE)
b411b363 5110 return;
b411b363 5111
545752d5
LE
5112 /* We are about to start the cleanup after connection loss.
5113 * Make sure drbd_make_request knows about that.
5114 * Usually we should be in some network failure state already,
5115 * but just in case we are not, we fix it up here.
5116 */
bde89a9e 5117 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
545752d5 5118
668700b4 5119 /* ack_receiver does not clean up anything. it must not interfere, either */
1c03e520 5120 drbd_thread_stop(&connection->ack_receiver);
668700b4
PR
5121 if (connection->ack_sender) {
5122 destroy_workqueue(connection->ack_sender);
5123 connection->ack_sender = NULL;
5124 }
bde89a9e 5125 drbd_free_sock(connection);
360cc740 5126
c141ebda 5127 rcu_read_lock();
c06ece6b
AG
5128 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5129 struct drbd_device *device = peer_device->device;
b30ab791 5130 kref_get(&device->kref);
c141ebda 5131 rcu_read_unlock();
69a22773 5132 drbd_disconnected(peer_device);
c06ece6b 5133 kref_put(&device->kref, drbd_destroy_device);
c141ebda
PR
5134 rcu_read_lock();
5135 }
5136 rcu_read_unlock();
5137
bde89a9e 5138 if (!list_empty(&connection->current_epoch->list))
1ec861eb 5139 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
12038a3a 5140 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
bde89a9e
AG
5141 atomic_set(&connection->current_epoch->epoch_size, 0);
5142 connection->send.seen_any_write_yet = false;
12038a3a 5143
1ec861eb 5144 drbd_info(connection, "Connection closed\n");
360cc740 5145
bde89a9e
AG
5146 if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
5147 conn_try_outdate_peer_async(connection);
cb703454 5148
0500813f 5149 spin_lock_irq(&connection->resource->req_lock);
bde89a9e 5150 oc = connection->cstate;
bbeb641c 5151 if (oc >= C_UNCONNECTED)
bde89a9e 5152 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
bbeb641c 5153
0500813f 5154 spin_unlock_irq(&connection->resource->req_lock);
360cc740 5155
f3dfa40a 5156 if (oc == C_DISCONNECTING)
bde89a9e 5157 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
360cc740
PR
5158}
5159
69a22773 5160static int drbd_disconnected(struct drbd_peer_device *peer_device)
360cc740 5161{
69a22773 5162 struct drbd_device *device = peer_device->device;
360cc740 5163 unsigned int i;
b411b363 5164
85719573 5165 /* wait for current activity to cease. */
0500813f 5166 spin_lock_irq(&device->resource->req_lock);
b30ab791
AG
5167 _drbd_wait_ee_list_empty(device, &device->active_ee);
5168 _drbd_wait_ee_list_empty(device, &device->sync_ee);
5169 _drbd_wait_ee_list_empty(device, &device->read_ee);
0500813f 5170 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
5171
5172 /* We do not have data structures that would allow us to
5173 * get the rs_pending_cnt down to 0 again.
5174 * * On C_SYNC_TARGET we do not have any data structures describing
5175 * the pending RSDataRequest's we have sent.
5176 * * On C_SYNC_SOURCE there is no data structure that tracks
5177 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5178 * And no, it is not the sum of the reference counts in the
5179 * resync_LRU. The resync_LRU tracks the whole operation including
5180 * the disk-IO, while the rs_pending_cnt only tracks the blocks
5181 * on the fly. */
b30ab791
AG
5182 drbd_rs_cancel_all(device);
5183 device->rs_total = 0;
5184 device->rs_failed = 0;
5185 atomic_set(&device->rs_pending_cnt, 0);
5186 wake_up(&device->misc_wait);
b411b363 5187
b30ab791 5188 del_timer_sync(&device->resync_timer);
2bccef39 5189 resync_timer_fn(&device->resync_timer);
b411b363 5190
b411b363
PR
5191 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5192 * w_make_resync_request etc. which may still be on the worker queue
5193 * to be "canceled" */
b5043c5e 5194 drbd_flush_workqueue(&peer_device->connection->sender_work);
b411b363 5195
b30ab791 5196 drbd_finish_peer_reqs(device);
b411b363 5197
d10b4ea3
PR
5198 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5199 might have issued a work again. The one before drbd_finish_peer_reqs() is
5200 necessary to reclain net_ee in drbd_finish_peer_reqs(). */
b5043c5e 5201 drbd_flush_workqueue(&peer_device->connection->sender_work);
d10b4ea3 5202
08332d73
LE
5203 /* need to do it again, drbd_finish_peer_reqs() may have populated it
5204 * again via drbd_try_clear_on_disk_bm(). */
b30ab791 5205 drbd_rs_cancel_all(device);
b411b363 5206
b30ab791
AG
5207 kfree(device->p_uuid);
5208 device->p_uuid = NULL;
b411b363 5209
b30ab791 5210 if (!drbd_suspended(device))
69a22773 5211 tl_clear(peer_device->connection);
b411b363 5212
b30ab791 5213 drbd_md_sync(device);
b411b363 5214
be115b69
LE
5215 if (get_ldev(device)) {
5216 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5217 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5218 put_ldev(device);
5219 }
20ceb2b2 5220
b411b363
PR
5221 /* tcp_close and release of sendpage pages can be deferred. I don't
5222 * want to use SO_LINGER, because apparently it can be deferred for
5223 * more than 20 seconds (longest time I checked).
5224 *
5225 * Actually we don't care for exactly when the network stack does its
5226 * put_page(), but release our reference on these pages right here.
5227 */
b30ab791 5228 i = drbd_free_peer_reqs(device, &device->net_ee);
b411b363 5229 if (i)
d0180171 5230 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
b30ab791 5231 i = atomic_read(&device->pp_in_use_by_net);
435f0740 5232 if (i)
d0180171 5233 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
b30ab791 5234 i = atomic_read(&device->pp_in_use);
b411b363 5235 if (i)
d0180171 5236 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
b411b363 5237
0b0ba1ef
AG
5238 D_ASSERT(device, list_empty(&device->read_ee));
5239 D_ASSERT(device, list_empty(&device->active_ee));
5240 D_ASSERT(device, list_empty(&device->sync_ee));
5241 D_ASSERT(device, list_empty(&device->done_ee));
b411b363 5242
360cc740 5243 return 0;
b411b363
PR
5244}
5245
5246/*
5247 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5248 * we can agree on is stored in agreed_pro_version.
5249 *
5250 * feature flags and the reserved array should be enough room for future
5251 * enhancements of the handshake protocol, and possible plugins...
5252 *
5253 * for now, they are expected to be zero, but ignored.
5254 */
bde89a9e 5255static int drbd_send_features(struct drbd_connection *connection)
b411b363 5256{
9f5bdc33
AG
5257 struct drbd_socket *sock;
5258 struct p_connection_features *p;
b411b363 5259
bde89a9e
AG
5260 sock = &connection->data;
5261 p = conn_prepare_command(connection, sock);
9f5bdc33 5262 if (!p)
e8d17b01 5263 return -EIO;
b411b363
PR
5264 memset(p, 0, sizeof(*p));
5265 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5266 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
20c68fde 5267 p->feature_flags = cpu_to_be32(PRO_FEATURES);
bde89a9e 5268 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
b411b363
PR
5269}
5270
5271/*
5272 * return values:
5273 * 1 yes, we have a valid connection
5274 * 0 oops, did not work out, please try again
5275 * -1 peer talks different language,
5276 * no point in trying again, please go standalone.
5277 */
bde89a9e 5278static int drbd_do_features(struct drbd_connection *connection)
b411b363 5279{
bde89a9e 5280 /* ASSERT current == connection->receiver ... */
e658983a
AG
5281 struct p_connection_features *p;
5282 const int expect = sizeof(struct p_connection_features);
77351055 5283 struct packet_info pi;
a5c31904 5284 int err;
b411b363 5285
bde89a9e 5286 err = drbd_send_features(connection);
e8d17b01 5287 if (err)
b411b363
PR
5288 return 0;
5289
bde89a9e 5290 err = drbd_recv_header(connection, &pi);
69bc7bc3 5291 if (err)
b411b363
PR
5292 return 0;
5293
6038178e 5294 if (pi.cmd != P_CONNECTION_FEATURES) {
1ec861eb 5295 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
2fcb8f30 5296 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5297 return -1;
5298 }
5299
77351055 5300 if (pi.size != expect) {
1ec861eb 5301 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 5302 expect, pi.size);
b411b363
PR
5303 return -1;
5304 }
5305
e658983a 5306 p = pi.data;
bde89a9e 5307 err = drbd_recv_all_warn(connection, p, expect);
a5c31904 5308 if (err)
b411b363 5309 return 0;
b411b363 5310
b411b363
PR
5311 p->protocol_min = be32_to_cpu(p->protocol_min);
5312 p->protocol_max = be32_to_cpu(p->protocol_max);
5313 if (p->protocol_max == 0)
5314 p->protocol_max = p->protocol_min;
5315
5316 if (PRO_VERSION_MAX < p->protocol_min ||
5317 PRO_VERSION_MIN > p->protocol_max)
5318 goto incompat;
5319
bde89a9e 5320 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
20c68fde 5321 connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
b411b363 5322
1ec861eb 5323 drbd_info(connection, "Handshake successful: "
bde89a9e 5324 "Agreed network protocol version %d\n", connection->agreed_pro_version);
b411b363 5325
f31e583a 5326 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
9104d31a
LE
5327 connection->agreed_features,
5328 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5329 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
f31e583a
LE
5330 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
5331 connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
9104d31a 5332 connection->agreed_features ? "" : " none");
92d94ae6 5333
b411b363
PR
5334 return 1;
5335
5336 incompat:
1ec861eb 5337 drbd_err(connection, "incompatible DRBD dialects: "
b411b363
PR
5338 "I support %d-%d, peer supports %d-%d\n",
5339 PRO_VERSION_MIN, PRO_VERSION_MAX,
5340 p->protocol_min, p->protocol_max);
5341 return -1;
5342}
5343
5344#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
bde89a9e 5345static int drbd_do_auth(struct drbd_connection *connection)
b411b363 5346{
1ec861eb
AG
5347 drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
5348 drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 5349 return -1;
b411b363
PR
5350}
5351#else
5352#define CHALLENGE_LEN 64
b10d96cb
JT
5353
5354/* Return value:
5355 1 - auth succeeded,
5356 0 - failed, try again (network error),
5357 -1 - auth failed, don't try again.
5358*/
5359
bde89a9e 5360static int drbd_do_auth(struct drbd_connection *connection)
b411b363 5361{
9f5bdc33 5362 struct drbd_socket *sock;
b411b363 5363 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
b411b363
PR
5364 char *response = NULL;
5365 char *right_response = NULL;
5366 char *peers_ch = NULL;
44ed167d
PR
5367 unsigned int key_len;
5368 char secret[SHARED_SECRET_MAX]; /* 64 byte */
b411b363 5369 unsigned int resp_size;
77ce56e2 5370 struct shash_desc *desc;
77351055 5371 struct packet_info pi;
44ed167d 5372 struct net_conf *nc;
69bc7bc3 5373 int err, rv;
b411b363 5374
9f5bdc33 5375 /* FIXME: Put the challenge/response into the preallocated socket buffer. */
b411b363 5376
44ed167d 5377 rcu_read_lock();
bde89a9e 5378 nc = rcu_dereference(connection->net_conf);
44ed167d
PR
5379 key_len = strlen(nc->shared_secret);
5380 memcpy(secret, nc->shared_secret, key_len);
5381 rcu_read_unlock();
5382
77ce56e2
AB
5383 desc = kmalloc(sizeof(struct shash_desc) +
5384 crypto_shash_descsize(connection->cram_hmac_tfm),
5385 GFP_KERNEL);
5386 if (!desc) {
5387 rv = -1;
5388 goto fail;
5389 }
9534d671 5390 desc->tfm = connection->cram_hmac_tfm;
b411b363 5391
9534d671 5392 rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
b411b363 5393 if (rv) {
9534d671 5394 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
b10d96cb 5395 rv = -1;
b411b363
PR
5396 goto fail;
5397 }
5398
5399 get_random_bytes(my_challenge, CHALLENGE_LEN);
5400
bde89a9e
AG
5401 sock = &connection->data;
5402 if (!conn_prepare_command(connection, sock)) {
9f5bdc33
AG
5403 rv = 0;
5404 goto fail;
5405 }
bde89a9e 5406 rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
9f5bdc33 5407 my_challenge, CHALLENGE_LEN);
b411b363
PR
5408 if (!rv)
5409 goto fail;
5410
bde89a9e 5411 err = drbd_recv_header(connection, &pi);
69bc7bc3
AG
5412 if (err) {
5413 rv = 0;
b411b363 5414 goto fail;
69bc7bc3 5415 }
b411b363 5416
77351055 5417 if (pi.cmd != P_AUTH_CHALLENGE) {
1ec861eb 5418 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
2fcb8f30 5419 cmdname(pi.cmd), pi.cmd);
9049ccd4 5420 rv = -1;
b411b363
PR
5421 goto fail;
5422 }
5423
77351055 5424 if (pi.size > CHALLENGE_LEN * 2) {
1ec861eb 5425 drbd_err(connection, "expected AuthChallenge payload too big.\n");
b10d96cb 5426 rv = -1;
b411b363
PR
5427 goto fail;
5428 }
5429
67cca286
PR
5430 if (pi.size < CHALLENGE_LEN) {
5431 drbd_err(connection, "AuthChallenge payload too small.\n");
5432 rv = -1;
5433 goto fail;
5434 }
5435
77351055 5436 peers_ch = kmalloc(pi.size, GFP_NOIO);
8404e191 5437 if (!peers_ch) {
b10d96cb 5438 rv = -1;
b411b363
PR
5439 goto fail;
5440 }
5441
bde89a9e 5442 err = drbd_recv_all_warn(connection, peers_ch, pi.size);
a5c31904 5443 if (err) {
b411b363
PR
5444 rv = 0;
5445 goto fail;
5446 }
5447
67cca286
PR
5448 if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
5449 drbd_err(connection, "Peer presented the same challenge!\n");
5450 rv = -1;
5451 goto fail;
5452 }
5453
9534d671 5454 resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
b411b363 5455 response = kmalloc(resp_size, GFP_NOIO);
8404e191 5456 if (!response) {
b10d96cb 5457 rv = -1;
b411b363
PR
5458 goto fail;
5459 }
5460
9534d671 5461 rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
b411b363 5462 if (rv) {
1ec861eb 5463 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 5464 rv = -1;
b411b363
PR
5465 goto fail;
5466 }
5467
bde89a9e 5468 if (!conn_prepare_command(connection, sock)) {
9f5bdc33 5469 rv = 0;
b411b363 5470 goto fail;
9f5bdc33 5471 }
bde89a9e 5472 rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
9f5bdc33 5473 response, resp_size);
b411b363
PR
5474 if (!rv)
5475 goto fail;
5476
bde89a9e 5477 err = drbd_recv_header(connection, &pi);
69bc7bc3 5478 if (err) {
b411b363
PR
5479 rv = 0;
5480 goto fail;
5481 }
5482
77351055 5483 if (pi.cmd != P_AUTH_RESPONSE) {
1ec861eb 5484 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
2fcb8f30 5485 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5486 rv = 0;
5487 goto fail;
5488 }
5489
77351055 5490 if (pi.size != resp_size) {
1ec861eb 5491 drbd_err(connection, "expected AuthResponse payload of wrong size\n");
b411b363
PR
5492 rv = 0;
5493 goto fail;
5494 }
b411b363 5495
bde89a9e 5496 err = drbd_recv_all_warn(connection, response , resp_size);
a5c31904 5497 if (err) {
b411b363
PR
5498 rv = 0;
5499 goto fail;
5500 }
5501
5502 right_response = kmalloc(resp_size, GFP_NOIO);
8404e191 5503 if (!right_response) {
b10d96cb 5504 rv = -1;
b411b363
PR
5505 goto fail;
5506 }
5507
9534d671
HX
5508 rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
5509 right_response);
b411b363 5510 if (rv) {
1ec861eb 5511 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 5512 rv = -1;
b411b363
PR
5513 goto fail;
5514 }
5515
5516 rv = !memcmp(response, right_response, resp_size);
5517
5518 if (rv)
1ec861eb 5519 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
44ed167d 5520 resp_size);
b10d96cb
JT
5521 else
5522 rv = -1;
b411b363
PR
5523
5524 fail:
5525 kfree(peers_ch);
5526 kfree(response);
5527 kfree(right_response);
77ce56e2
AB
5528 if (desc) {
5529 shash_desc_zero(desc);
5530 kfree(desc);
5531 }
b411b363
PR
5532
5533 return rv;
5534}
5535#endif
5536
8fe60551 5537int drbd_receiver(struct drbd_thread *thi)
b411b363 5538{
bde89a9e 5539 struct drbd_connection *connection = thi->connection;
b411b363
PR
5540 int h;
5541
1ec861eb 5542 drbd_info(connection, "receiver (re)started\n");
b411b363
PR
5543
5544 do {
bde89a9e 5545 h = conn_connect(connection);
b411b363 5546 if (h == 0) {
bde89a9e 5547 conn_disconnect(connection);
20ee6390 5548 schedule_timeout_interruptible(HZ);
b411b363
PR
5549 }
5550 if (h == -1) {
1ec861eb 5551 drbd_warn(connection, "Discarding network configuration.\n");
bde89a9e 5552 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
5553 }
5554 } while (h == 0);
5555
c51a0ef3
LE
5556 if (h > 0) {
5557 blk_start_plug(&connection->receiver_plug);
bde89a9e 5558 drbdd(connection);
c51a0ef3
LE
5559 blk_finish_plug(&connection->receiver_plug);
5560 }
b411b363 5561
bde89a9e 5562 conn_disconnect(connection);
b411b363 5563
1ec861eb 5564 drbd_info(connection, "receiver terminated\n");
b411b363
PR
5565 return 0;
5566}
5567
5568/* ********* acknowledge sender ******** */
5569
bde89a9e 5570static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5571{
e658983a 5572 struct p_req_state_reply *p = pi->data;
e4f78ede
PR
5573 int retcode = be32_to_cpu(p->retcode);
5574
5575 if (retcode >= SS_SUCCESS) {
bde89a9e 5576 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
e4f78ede 5577 } else {
bde89a9e 5578 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
1ec861eb 5579 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
e4f78ede
PR
5580 drbd_set_st_err_str(retcode), retcode);
5581 }
bde89a9e 5582 wake_up(&connection->ping_wait);
e4f78ede 5583
2735a594 5584 return 0;
e4f78ede 5585}
b411b363 5586
bde89a9e 5587static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5588{
9f4fe9ad 5589 struct drbd_peer_device *peer_device;
b30ab791 5590 struct drbd_device *device;
e658983a 5591 struct p_req_state_reply *p = pi->data;
b411b363
PR
5592 int retcode = be32_to_cpu(p->retcode);
5593
9f4fe9ad
AG
5594 peer_device = conn_peer_device(connection, pi->vnr);
5595 if (!peer_device)
2735a594 5596 return -EIO;
9f4fe9ad 5597 device = peer_device->device;
1952e916 5598
bde89a9e 5599 if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
0b0ba1ef 5600 D_ASSERT(device, connection->agreed_pro_version < 100);
bde89a9e 5601 return got_conn_RqSReply(connection, pi);
4d0fc3fd
PR
5602 }
5603
b411b363 5604 if (retcode >= SS_SUCCESS) {
b30ab791 5605 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
b411b363 5606 } else {
b30ab791 5607 set_bit(CL_ST_CHG_FAIL, &device->flags);
d0180171 5608 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
e4f78ede 5609 drbd_set_st_err_str(retcode), retcode);
b411b363 5610 }
b30ab791 5611 wake_up(&device->state_wait);
b411b363 5612
2735a594 5613 return 0;
b411b363
PR
5614}
5615
bde89a9e 5616static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5617{
bde89a9e 5618 return drbd_send_ping_ack(connection);
b411b363
PR
5619
5620}
5621
bde89a9e 5622static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363
PR
5623{
5624 /* restore idle timeout */
bde89a9e
AG
5625 connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5626 if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5627 wake_up(&connection->ping_wait);
b411b363 5628
2735a594 5629 return 0;
b411b363
PR
5630}
5631
bde89a9e 5632static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5633{
9f4fe9ad 5634 struct drbd_peer_device *peer_device;
b30ab791 5635 struct drbd_device *device;
e658983a 5636 struct p_block_ack *p = pi->data;
b411b363
PR
5637 sector_t sector = be64_to_cpu(p->sector);
5638 int blksize = be32_to_cpu(p->blksize);
5639
9f4fe9ad
AG
5640 peer_device = conn_peer_device(connection, pi->vnr);
5641 if (!peer_device)
2735a594 5642 return -EIO;
9f4fe9ad 5643 device = peer_device->device;
1952e916 5644
9f4fe9ad 5645 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
b411b363 5646
69a22773 5647 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5648
b30ab791
AG
5649 if (get_ldev(device)) {
5650 drbd_rs_complete_io(device, sector);
5651 drbd_set_in_sync(device, sector, blksize);
1d53f09e 5652 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
b30ab791
AG
5653 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5654 put_ldev(device);
1d53f09e 5655 }
b30ab791
AG
5656 dec_rs_pending(device);
5657 atomic_add(blksize >> 9, &device->rs_sect_in);
b411b363 5658
2735a594 5659 return 0;
b411b363
PR
5660}
5661
bc9c5c41 5662static int
b30ab791 5663validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
bc9c5c41
AG
5664 struct rb_root *root, const char *func,
5665 enum drbd_req_event what, bool missing_ok)
b411b363
PR
5666{
5667 struct drbd_request *req;
5668 struct bio_and_error m;
5669
0500813f 5670 spin_lock_irq(&device->resource->req_lock);
b30ab791 5671 req = find_request(device, root, id, sector, missing_ok, func);
b411b363 5672 if (unlikely(!req)) {
0500813f 5673 spin_unlock_irq(&device->resource->req_lock);
85997675 5674 return -EIO;
b411b363
PR
5675 }
5676 __req_mod(req, what, &m);
0500813f 5677 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
5678
5679 if (m.bio)
b30ab791 5680 complete_master_bio(device, &m);
85997675 5681 return 0;
b411b363
PR
5682}
5683
bde89a9e 5684static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5685{
9f4fe9ad 5686 struct drbd_peer_device *peer_device;
b30ab791 5687 struct drbd_device *device;
e658983a 5688 struct p_block_ack *p = pi->data;
b411b363
PR
5689 sector_t sector = be64_to_cpu(p->sector);
5690 int blksize = be32_to_cpu(p->blksize);
5691 enum drbd_req_event what;
5692
9f4fe9ad
AG
5693 peer_device = conn_peer_device(connection, pi->vnr);
5694 if (!peer_device)
2735a594 5695 return -EIO;
9f4fe9ad 5696 device = peer_device->device;
1952e916 5697
69a22773 5698 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5699
579b57ed 5700 if (p->block_id == ID_SYNCER) {
b30ab791
AG
5701 drbd_set_in_sync(device, sector, blksize);
5702 dec_rs_pending(device);
2735a594 5703 return 0;
b411b363 5704 }
e05e1e59 5705 switch (pi->cmd) {
b411b363 5706 case P_RS_WRITE_ACK:
8554df1c 5707 what = WRITE_ACKED_BY_PEER_AND_SIS;
b411b363
PR
5708 break;
5709 case P_WRITE_ACK:
8554df1c 5710 what = WRITE_ACKED_BY_PEER;
b411b363
PR
5711 break;
5712 case P_RECV_ACK:
8554df1c 5713 what = RECV_ACKED_BY_PEER;
b411b363 5714 break;
d4dabbe2
LE
5715 case P_SUPERSEDED:
5716 what = CONFLICT_RESOLVED;
b411b363 5717 break;
7be8da07 5718 case P_RETRY_WRITE:
7be8da07 5719 what = POSTPONE_WRITE;
b411b363
PR
5720 break;
5721 default:
2735a594 5722 BUG();
b411b363
PR
5723 }
5724
b30ab791
AG
5725 return validate_req_change_req_state(device, p->block_id, sector,
5726 &device->write_requests, __func__,
2735a594 5727 what, false);
b411b363
PR
5728}
5729
bde89a9e 5730static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5731{
9f4fe9ad 5732 struct drbd_peer_device *peer_device;
b30ab791 5733 struct drbd_device *device;
e658983a 5734 struct p_block_ack *p = pi->data;
b411b363 5735 sector_t sector = be64_to_cpu(p->sector);
2deb8336 5736 int size = be32_to_cpu(p->blksize);
85997675 5737 int err;
b411b363 5738
9f4fe9ad
AG
5739 peer_device = conn_peer_device(connection, pi->vnr);
5740 if (!peer_device)
2735a594 5741 return -EIO;
9f4fe9ad 5742 device = peer_device->device;
b411b363 5743
69a22773 5744 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5745
579b57ed 5746 if (p->block_id == ID_SYNCER) {
b30ab791
AG
5747 dec_rs_pending(device);
5748 drbd_rs_failed_io(device, sector, size);
2735a594 5749 return 0;
b411b363 5750 }
2deb8336 5751
b30ab791
AG
5752 err = validate_req_change_req_state(device, p->block_id, sector,
5753 &device->write_requests, __func__,
303d1448 5754 NEG_ACKED, true);
85997675 5755 if (err) {
c3afd8f5
AG
5756 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5757 The master bio might already be completed, therefore the
5758 request is no longer in the collision hash. */
5759 /* In Protocol B we might already have got a P_RECV_ACK
5760 but then get a P_NEG_ACK afterwards. */
b30ab791 5761 drbd_set_out_of_sync(device, sector, size);
2deb8336 5762 }
2735a594 5763 return 0;
b411b363
PR
5764}
5765
bde89a9e 5766static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5767{
9f4fe9ad 5768 struct drbd_peer_device *peer_device;
b30ab791 5769 struct drbd_device *device;
e658983a 5770 struct p_block_ack *p = pi->data;
b411b363
PR
5771 sector_t sector = be64_to_cpu(p->sector);
5772
9f4fe9ad
AG
5773 peer_device = conn_peer_device(connection, pi->vnr);
5774 if (!peer_device)
2735a594 5775 return -EIO;
9f4fe9ad 5776 device = peer_device->device;
1952e916 5777
69a22773 5778 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
7be8da07 5779
d0180171 5780 drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
b411b363
PR
5781 (unsigned long long)sector, be32_to_cpu(p->blksize));
5782
b30ab791
AG
5783 return validate_req_change_req_state(device, p->block_id, sector,
5784 &device->read_requests, __func__,
2735a594 5785 NEG_ACKED, false);
b411b363
PR
5786}
5787
bde89a9e 5788static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5789{
9f4fe9ad 5790 struct drbd_peer_device *peer_device;
b30ab791 5791 struct drbd_device *device;
b411b363
PR
5792 sector_t sector;
5793 int size;
e658983a 5794 struct p_block_ack *p = pi->data;
1952e916 5795
9f4fe9ad
AG
5796 peer_device = conn_peer_device(connection, pi->vnr);
5797 if (!peer_device)
2735a594 5798 return -EIO;
9f4fe9ad 5799 device = peer_device->device;
b411b363
PR
5800
5801 sector = be64_to_cpu(p->sector);
5802 size = be32_to_cpu(p->blksize);
b411b363 5803
69a22773 5804 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5805
b30ab791 5806 dec_rs_pending(device);
b411b363 5807
b30ab791
AG
5808 if (get_ldev_if_state(device, D_FAILED)) {
5809 drbd_rs_complete_io(device, sector);
e05e1e59 5810 switch (pi->cmd) {
d612d309 5811 case P_NEG_RS_DREPLY:
b30ab791 5812 drbd_rs_failed_io(device, sector, size);
6327c911 5813 break;
d612d309
PR
5814 case P_RS_CANCEL:
5815 break;
5816 default:
2735a594 5817 BUG();
d612d309 5818 }
b30ab791 5819 put_ldev(device);
b411b363
PR
5820 }
5821
2735a594 5822 return 0;
b411b363
PR
5823}
5824
bde89a9e 5825static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5826{
e658983a 5827 struct p_barrier_ack *p = pi->data;
c06ece6b 5828 struct drbd_peer_device *peer_device;
9ed57dcb 5829 int vnr;
1952e916 5830
bde89a9e 5831 tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
b411b363 5832
9ed57dcb 5833 rcu_read_lock();
c06ece6b
AG
5834 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5835 struct drbd_device *device = peer_device->device;
5836
b30ab791
AG
5837 if (device->state.conn == C_AHEAD &&
5838 atomic_read(&device->ap_in_flight) == 0 &&
5839 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5840 device->start_resync_timer.expires = jiffies + HZ;
5841 add_timer(&device->start_resync_timer);
9ed57dcb 5842 }
c4752ef1 5843 }
9ed57dcb 5844 rcu_read_unlock();
c4752ef1 5845
2735a594 5846 return 0;
b411b363
PR
5847}
5848
bde89a9e 5849static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5850{
9f4fe9ad 5851 struct drbd_peer_device *peer_device;
b30ab791 5852 struct drbd_device *device;
e658983a 5853 struct p_block_ack *p = pi->data;
84b8c06b 5854 struct drbd_device_work *dw;
b411b363
PR
5855 sector_t sector;
5856 int size;
5857
9f4fe9ad
AG
5858 peer_device = conn_peer_device(connection, pi->vnr);
5859 if (!peer_device)
2735a594 5860 return -EIO;
9f4fe9ad 5861 device = peer_device->device;
1952e916 5862
b411b363
PR
5863 sector = be64_to_cpu(p->sector);
5864 size = be32_to_cpu(p->blksize);
5865
69a22773 5866 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363
PR
5867
5868 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
b30ab791 5869 drbd_ov_out_of_sync_found(device, sector, size);
b411b363 5870 else
b30ab791 5871 ov_out_of_sync_print(device);
b411b363 5872
b30ab791 5873 if (!get_ldev(device))
2735a594 5874 return 0;
1d53f09e 5875
b30ab791
AG
5876 drbd_rs_complete_io(device, sector);
5877 dec_rs_pending(device);
b411b363 5878
b30ab791 5879 --device->ov_left;
ea5442af
LE
5880
5881 /* let's advance progress step marks only for every other megabyte */
b30ab791
AG
5882 if ((device->ov_left & 0x200) == 0x200)
5883 drbd_advance_rs_marks(device, device->ov_left);
ea5442af 5884
b30ab791 5885 if (device->ov_left == 0) {
84b8c06b
AG
5886 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5887 if (dw) {
5888 dw->w.cb = w_ov_finished;
5889 dw->device = device;
5890 drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
b411b363 5891 } else {
84b8c06b 5892 drbd_err(device, "kmalloc(dw) failed.");
b30ab791
AG
5893 ov_out_of_sync_print(device);
5894 drbd_resync_finished(device);
b411b363
PR
5895 }
5896 }
b30ab791 5897 put_ldev(device);
2735a594 5898 return 0;
b411b363
PR
5899}
5900
bde89a9e 5901static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
0ced55a3 5902{
2735a594 5903 return 0;
b411b363
PR
5904}
5905
668700b4
PR
5906struct meta_sock_cmd {
5907 size_t pkt_size;
5908 int (*fn)(struct drbd_connection *connection, struct packet_info *);
5909};
5910
5911static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
0ced55a3 5912{
668700b4
PR
5913 long t;
5914 struct net_conf *nc;
32862ec7 5915
668700b4
PR
5916 rcu_read_lock();
5917 nc = rcu_dereference(connection->net_conf);
5918 t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5919 rcu_read_unlock();
c141ebda 5920
668700b4
PR
5921 t *= HZ;
5922 if (ping_timeout)
5923 t /= 10;
082a3439 5924
668700b4
PR
5925 connection->meta.socket->sk->sk_rcvtimeo = t;
5926}
32862ec7 5927
668700b4
PR
5928static void set_ping_timeout(struct drbd_connection *connection)
5929{
5930 set_rcvtimeo(connection, 1);
0ced55a3
PR
5931}
5932
668700b4
PR
5933static void set_idle_timeout(struct drbd_connection *connection)
5934{
5935 set_rcvtimeo(connection, 0);
5936}
b411b363 5937
668700b4 5938static struct meta_sock_cmd ack_receiver_tbl[] = {
e658983a
AG
5939 [P_PING] = { 0, got_Ping },
5940 [P_PING_ACK] = { 0, got_PingAck },
b411b363
PR
5941 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5942 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5943 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
d4dabbe2 5944 [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
b411b363
PR
5945 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
5946 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
1952e916 5947 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
b411b363
PR
5948 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
5949 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
5950 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5951 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
02918be2 5952 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
1952e916
AG
5953 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
5954 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5955 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
7201b972 5956};
b411b363 5957
1c03e520 5958int drbd_ack_receiver(struct drbd_thread *thi)
b411b363 5959{
bde89a9e 5960 struct drbd_connection *connection = thi->connection;
668700b4 5961 struct meta_sock_cmd *cmd = NULL;
77351055 5962 struct packet_info pi;
668700b4 5963 unsigned long pre_recv_jif;
257d0af6 5964 int rv;
bde89a9e 5965 void *buf = connection->meta.rbuf;
b411b363 5966 int received = 0;
bde89a9e 5967 unsigned int header_size = drbd_header_size(connection);
52b061a4 5968 int expect = header_size;
44ed167d 5969 bool ping_timeout_active = false;
b411b363 5970
8b700983 5971 sched_set_fifo_low(current);
b411b363 5972
e77a0a5c 5973 while (get_t_state(thi) == RUNNING) {
80822284 5974 drbd_thread_current_set_cpu(thi);
b411b363 5975
668700b4 5976 conn_reclaim_net_peer_reqs(connection);
44ed167d 5977
bde89a9e
AG
5978 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5979 if (drbd_send_ping(connection)) {
1ec861eb 5980 drbd_err(connection, "drbd_send_ping has failed\n");
b411b363 5981 goto reconnect;
841ce241 5982 }
668700b4 5983 set_ping_timeout(connection);
44ed167d 5984 ping_timeout_active = true;
b411b363
PR
5985 }
5986
668700b4 5987 pre_recv_jif = jiffies;
bde89a9e 5988 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
b411b363
PR
5989
5990 /* Note:
5991 * -EINTR (on meta) we got a signal
5992 * -EAGAIN (on meta) rcvtimeo expired
5993 * -ECONNRESET other side closed the connection
5994 * -ERESTARTSYS (on data) we got a signal
5995 * rv < 0 other than above: unexpected error!
5996 * rv == expected: full header or command
5997 * rv < expected: "woken" by signal during receive
5998 * rv == 0 : "connection shut down by peer"
5999 */
6000 if (likely(rv > 0)) {
6001 received += rv;
6002 buf += rv;
6003 } else if (rv == 0) {
bde89a9e 6004 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
b66623e3
PR
6005 long t;
6006 rcu_read_lock();
bde89a9e 6007 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
b66623e3
PR
6008 rcu_read_unlock();
6009
bde89a9e
AG
6010 t = wait_event_timeout(connection->ping_wait,
6011 connection->cstate < C_WF_REPORT_PARAMS,
b66623e3 6012 t);
599377ac
PR
6013 if (t)
6014 break;
6015 }
1ec861eb 6016 drbd_err(connection, "meta connection shut down by peer.\n");
b411b363
PR
6017 goto reconnect;
6018 } else if (rv == -EAGAIN) {
cb6518cb
LE
6019 /* If the data socket received something meanwhile,
6020 * that is good enough: peer is still alive. */
668700b4 6021 if (time_after(connection->last_received, pre_recv_jif))
cb6518cb 6022 continue;
f36af18c 6023 if (ping_timeout_active) {
1ec861eb 6024 drbd_err(connection, "PingAck did not arrive in time.\n");
b411b363
PR
6025 goto reconnect;
6026 }
bde89a9e 6027 set_bit(SEND_PING, &connection->flags);
b411b363
PR
6028 continue;
6029 } else if (rv == -EINTR) {
668700b4
PR
6030 /* maybe drbd_thread_stop(): the while condition will notice.
6031 * maybe woken for send_ping: we'll send a ping above,
6032 * and change the rcvtimeo */
6033 flush_signals(current);
b411b363
PR
6034 continue;
6035 } else {
1ec861eb 6036 drbd_err(connection, "sock_recvmsg returned %d\n", rv);
b411b363
PR
6037 goto reconnect;
6038 }
6039
6040 if (received == expect && cmd == NULL) {
bde89a9e 6041 if (decode_header(connection, connection->meta.rbuf, &pi))
b411b363 6042 goto reconnect;
668700b4
PR
6043 cmd = &ack_receiver_tbl[pi.cmd];
6044 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
1ec861eb 6045 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
2fcb8f30 6046 cmdname(pi.cmd), pi.cmd);
b411b363
PR
6047 goto disconnect;
6048 }
e658983a 6049 expect = header_size + cmd->pkt_size;
52b061a4 6050 if (pi.size != expect - header_size) {
1ec861eb 6051 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
77351055 6052 pi.cmd, pi.size);
b411b363 6053 goto reconnect;
257d0af6 6054 }
b411b363
PR
6055 }
6056 if (received == expect) {
2735a594 6057 bool err;
a4fbda8e 6058
bde89a9e 6059 err = cmd->fn(connection, &pi);
2735a594 6060 if (err) {
d75f773c 6061 drbd_err(connection, "%ps failed\n", cmd->fn);
b411b363 6062 goto reconnect;
1952e916 6063 }
b411b363 6064
bde89a9e 6065 connection->last_received = jiffies;
f36af18c 6066
668700b4
PR
6067 if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
6068 set_idle_timeout(connection);
44ed167d
PR
6069 ping_timeout_active = false;
6070 }
f36af18c 6071
bde89a9e 6072 buf = connection->meta.rbuf;
b411b363 6073 received = 0;
52b061a4 6074 expect = header_size;
b411b363
PR
6075 cmd = NULL;
6076 }
6077 }
6078
6079 if (0) {
6080reconnect:
bde89a9e
AG
6081 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6082 conn_md_sync(connection);
b411b363
PR
6083 }
6084 if (0) {
6085disconnect:
bde89a9e 6086 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 6087 }
b411b363 6088
668700b4 6089 drbd_info(connection, "ack_receiver terminated\n");
b411b363
PR
6090
6091 return 0;
6092}
668700b4
PR
6093
6094void drbd_send_acks_wf(struct work_struct *ws)
6095{
6096 struct drbd_peer_device *peer_device =
6097 container_of(ws, struct drbd_peer_device, send_acks_work);
6098 struct drbd_connection *connection = peer_device->connection;
6099 struct drbd_device *device = peer_device->device;
6100 struct net_conf *nc;
6101 int tcp_cork, err;
6102
6103 rcu_read_lock();
6104 nc = rcu_dereference(connection->net_conf);
6105 tcp_cork = nc->tcp_cork;
6106 rcu_read_unlock();
6107
6108 if (tcp_cork)
db10538a 6109 tcp_sock_set_cork(connection->meta.socket->sk, true);
668700b4
PR
6110
6111 err = drbd_finish_peer_reqs(device);
6112 kref_put(&device->kref, drbd_destroy_device);
6113 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
6114 struct work_struct send_acks_work alive, which is in the peer_device object */
6115
6116 if (err) {
6117 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6118 return;
6119 }
6120
6121 if (tcp_cork)
db10538a 6122 tcp_sock_set_cork(connection->meta.socket->sk, false);
668700b4
PR
6123
6124 return;
6125}