drivers/block/drbd/drbd_receiver.c
/*
   drbd_receiver.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/module.h>

#include <linux/uaccess.h>
#include <net/sock.h>

#include <linux/drbd.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched/signal.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
#include "drbd_vli.h"

#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)

struct packet_info {
	enum drbd_packet cmd;
	unsigned int size;
	unsigned int vnr;
	void *data;
};

enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};

static int drbd_do_features(struct drbd_connection *connection);
static int drbd_do_auth(struct drbd_connection *connection);
static int drbd_disconnected(struct drbd_peer_device *);
static void conn_wait_active_ee_empty(struct drbd_connection *connection);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_work *, int);


#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

/*
 * some helper functions to deal with single linked page lists,
 * page->private being our "next" pointer.
 */

/* If at least n pages are linked at head, get n pages off.
 * Otherwise, don't modify head, and return NULL.
 * Locking is the responsibility of the caller.
 */
static struct page *page_chain_del(struct page **head, int n)
{
	struct page *page;
	struct page *tmp;

	BUG_ON(!n);
	BUG_ON(!head);

	page = *head;

	if (!page)
		return NULL;

	while (page) {
		tmp = page_chain_next(page);
		if (--n == 0)
			break; /* found sufficient pages */
		if (tmp == NULL)
			/* insufficient pages, don't use any of them. */
			return NULL;
		page = tmp;
	}

	/* add end of list marker for the returned list */
	set_page_private(page, 0);
	/* actual return value, and adjustment of head */
	page = *head;
	*head = tmp;
	return page;
}

/* may be used outside of locks to find the tail of a (usually short)
 * "private" page chain, before adding it back to a global chain head
 * with page_chain_add() under a spinlock. */
static struct page *page_chain_tail(struct page *page, int *len)
{
	struct page *tmp;
	int i = 1;
	while ((tmp = page_chain_next(page)))
		++i, page = tmp;
	if (len)
		*len = i;
	return page;
}

static int page_chain_free(struct page *page)
{
	struct page *tmp;
	int i = 0;
	page_chain_for_each_safe(page, tmp) {
		put_page(page);
		++i;
	}
	return i;
}

static void page_chain_add(struct page **head,
		struct page *chain_first, struct page *chain_last)
{
#if 1
	struct page *tmp;
	tmp = page_chain_tail(chain_first, NULL);
	BUG_ON(tmp != chain_last);
#endif

	/* add chain to head */
	set_page_private(chain_last, (unsigned long)*head);
	*head = chain_first;
}

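/* Grab a chain of @number pages for @device: first try the preallocated
 * drbd_pp_pool, then fall back to alloc_page(GFP_TRY).  On partial failure,
 * the pages already allocated are chained back into the pool and NULL is
 * returned; drbd_alloc_pages() will retry "soon". */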
static struct page *__drbd_alloc_pages(struct drbd_device *device,
				       unsigned int number)
{
	struct page *page = NULL;
	struct page *tmp = NULL;
	unsigned int i = 0;

	/* Yes, testing drbd_pp_vacant outside the lock is racy.
	 * So what. It saves a spin_lock. */
	if (drbd_pp_vacant >= number) {
		spin_lock(&drbd_pp_lock);
		page = page_chain_del(&drbd_pp_pool, number);
		if (page)
			drbd_pp_vacant -= number;
		spin_unlock(&drbd_pp_lock);
		if (page)
			return page;
	}

	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	for (i = 0; i < number; i++) {
		tmp = alloc_page(GFP_TRY);
		if (!tmp)
			break;
		set_page_private(tmp, (unsigned long)page);
		page = tmp;
	}

	if (i == number)
		return page;

	/* Not enough pages immediately available this time.
	 * No need to jump around here, drbd_alloc_pages will retry this
	 * function "soon". */
	if (page) {
		tmp = page_chain_tail(page, NULL);
		spin_lock(&drbd_pp_lock);
		page_chain_add(&drbd_pp_pool, page, tmp);
		drbd_pp_vacant += i;
		spin_unlock(&drbd_pp_lock);
	}
	return NULL;
}

static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
					   struct list_head *to_be_freed)
{
	struct drbd_peer_request *peer_req, *tmp;

	/* The EEs are always appended to the end of the list. Since
	   they are sent in order over the wire, they have to finish
	   in order. As soon as we see the first one that is not finished,
	   we can stop examining the list... */

	list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
		if (drbd_peer_req_has_active_page(peer_req))
			break;
		list_move(&peer_req->w.list, to_be_freed);
	}
}

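/* For one device, collect the finished net_ee peer requests (those whose
 * pages the network stack no longer references) under the req_lock and
 * free them outside of it. */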
static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
{
	LIST_HEAD(reclaimed);
	struct drbd_peer_request *peer_req, *t;

	spin_lock_irq(&device->resource->req_lock);
	reclaim_finished_net_peer_reqs(device, &reclaimed);
	spin_unlock_irq(&device->resource->req_lock);
	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
		drbd_free_net_peer_req(device, peer_req);
}

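/* Same as above, but for every volume of the connection that still has
 * pages in use by the network stack. */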
static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
{
	struct drbd_peer_device *peer_device;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		if (!atomic_read(&device->pp_in_use_by_net))
			continue;

		kref_get(&device->kref);
		rcu_read_unlock();
		drbd_reclaim_net_peer_reqs(device);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

/**
 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 * @peer_device:	DRBD device.
 * @number:		number of pages requested
 * @retry:		whether to retry, if not enough pages are available right now
 *
 * Tries to allocate number pages, first from our own page pool, then from
 * the kernel.
 * Possibly retry until DRBD frees sufficient pages somewhere else.
 *
 * If this allocation would exceed the max_buffers setting, we throttle
 * allocation (schedule_timeout) to give the system some room to breathe.
 *
 * We do not use max-buffers as hard limit, because it could lead to
 * congestion and further to a distributed deadlock during online-verify or
 * (checksum based) resync, if the max-buffers, socket buffer sizes and
 * resync-rate settings are mis-configured.
 *
 * Returns a page chain linked via page->private.
 */
struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
			      bool retry)
{
	struct drbd_device *device = peer_device->device;
	struct page *page = NULL;
	struct net_conf *nc;
	DEFINE_WAIT(wait);
	unsigned int mxb;

	rcu_read_lock();
	nc = rcu_dereference(peer_device->connection->net_conf);
	mxb = nc ? nc->max_buffers : 1000000;
	rcu_read_unlock();

	if (atomic_read(&device->pp_in_use) < mxb)
		page = __drbd_alloc_pages(device, number);

	/* Try to keep the fast path fast, but occasionally we need
	 * to reclaim the pages we lent to the network stack. */
	if (page && atomic_read(&device->pp_in_use_by_net) > 512)
		drbd_reclaim_net_peer_reqs(device);

	while (page == NULL) {
		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);

		drbd_reclaim_net_peer_reqs(device);

		if (atomic_read(&device->pp_in_use) < mxb) {
			page = __drbd_alloc_pages(device, number);
			if (page)
				break;
		}

		if (!retry)
			break;

		if (signal_pending(current)) {
			drbd_warn(device, "drbd_alloc_pages interrupted!\n");
			break;
		}

		if (schedule_timeout(HZ/10) == 0)
			mxb = UINT_MAX;
	}
	finish_wait(&drbd_pp_wait, &wait);

	if (page)
		atomic_add(number, &device->pp_in_use);
	return page;
}

/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 * Is also used from inside another spin_lock_irq(&resource->req_lock);
 * Either links the page chain back to the global pool,
 * or returns all pages to the system. */
static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
{
	atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
	int i;

	if (page == NULL)
		return;

	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
		i = page_chain_free(page);
	else {
		struct page *tmp;
		tmp = page_chain_tail(page, &i);
		spin_lock(&drbd_pp_lock);
		page_chain_add(&drbd_pp_pool, page, tmp);
		drbd_pp_vacant += i;
		spin_unlock(&drbd_pp_lock);
	}
	i = atomic_sub_return(i, a);
	if (i < 0)
		drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
	wake_up(&drbd_pp_wait);
}

/*
You need to hold the req_lock:
 _drbd_wait_ee_list_empty()

You must not have the req_lock:
 drbd_free_peer_req()
 drbd_alloc_peer_req()
 drbd_free_peer_reqs()
 drbd_ee_fix_bhs()
 drbd_finish_peer_reqs()
 drbd_clear_done_ee()
 drbd_wait_ee_list_empty()
*/

/* normal: payload_size == request size (bi_size)
 * w_same: payload_size == logical_block_size
 * trim: payload_size == 0 */
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
		    unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	struct drbd_peer_request *peer_req;
	struct page *page = NULL;
	unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;

	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
		return NULL;

	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
	if (!peer_req) {
		if (!(gfp_mask & __GFP_NOWARN))
			drbd_err(device, "%s: allocation failed\n", __func__);
		return NULL;
	}

	if (nr_pages) {
		page = drbd_alloc_pages(peer_device, nr_pages,
					gfpflags_allow_blocking(gfp_mask));
		if (!page)
			goto fail;
	}

	memset(peer_req, 0, sizeof(*peer_req));
	INIT_LIST_HEAD(&peer_req->w.list);
	drbd_clear_interval(&peer_req->i);
	peer_req->i.size = request_size;
	peer_req->i.sector = sector;
	peer_req->submit_jif = jiffies;
	peer_req->peer_device = peer_device;
	peer_req->pages = page;
	/*
	 * The block_id is opaque to the receiver.  It is not endianness
	 * converted, and sent back to the sender unchanged.
	 */
	peer_req->block_id = id;

	return peer_req;

 fail:
	mempool_free(peer_req, &drbd_ee_mempool);
	return NULL;
}

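/* Release everything a peer request owns: its digest, its page chain and,
 * if unexpectedly still flagged, its activity log reference; finally give
 * the request back to the mempool.  May sleep. */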
void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
		       int is_net)
{
	might_sleep();
	if (peer_req->flags & EE_HAS_DIGEST)
		kfree(peer_req->digest);
	drbd_free_pages(device, peer_req->pages, is_net);
	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
	if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
		drbd_al_complete_io(device, &peer_req->i);
	}
	mempool_free(peer_req, &drbd_ee_mempool);
}

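/* Splice @list off under the req_lock, free every peer request on it, and
 * return the number of requests freed. */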
int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
{
	LIST_HEAD(work_list);
	struct drbd_peer_request *peer_req, *t;
	int count = 0;
	int is_net = list == &device->net_ee;

	spin_lock_irq(&device->resource->req_lock);
	list_splice_init(list, &work_list);
	spin_unlock_irq(&device->resource->req_lock);

	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
		__drbd_free_peer_req(device, peer_req, is_net);
		count++;
	}
	return count;
}

/*
 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 */
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
	LIST_HEAD(work_list);
	LIST_HEAD(reclaimed);
	struct drbd_peer_request *peer_req, *t;
	int err = 0;

	spin_lock_irq(&device->resource->req_lock);
	reclaim_finished_net_peer_reqs(device, &reclaimed);
	list_splice_init(&device->done_ee, &work_list);
	spin_unlock_irq(&device->resource->req_lock);

	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
		drbd_free_net_peer_req(device, peer_req);

	/* possible callbacks here:
	 * e_end_block, and e_end_resync_block, e_send_superseded.
	 * all ignore the last argument.
	 */
	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
		int err2;

		/* list_del not necessary, next/prev members not touched */
		err2 = peer_req->w.cb(&peer_req->w, !!err);
		if (!err)
			err = err2;
		drbd_free_peer_req(device, peer_req);
	}
	wake_up(&device->ee_wait);

	return err;
}

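/* Wait until @head is empty.  The caller holds the req_lock; it is dropped
 * around io_schedule() and re-taken before returning. */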
static void _drbd_wait_ee_list_empty(struct drbd_device *device,
				     struct list_head *head)
{
	DEFINE_WAIT(wait);

	/* avoids spin_lock/unlock
	 * and calling prepare_to_wait in the fast path */
	while (!list_empty(head)) {
		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&device->resource->req_lock);
		io_schedule();
		finish_wait(&device->ee_wait, &wait);
		spin_lock_irq(&device->resource->req_lock);
	}
}

static void drbd_wait_ee_list_empty(struct drbd_device *device,
				    struct list_head *head)
{
	spin_lock_irq(&device->resource->req_lock);
	_drbd_wait_ee_list_empty(device, head);
	spin_unlock_irq(&device->resource->req_lock);
}

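/* Thin wrapper around sock_recvmsg(): receive up to @size bytes into @buf.
 * With flags == 0 it defaults to MSG_WAITALL | MSG_NOSIGNAL. */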
static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
	struct kvec iov = {
		.iov_base = buf,
		.iov_len = size,
	};
	struct msghdr msg = {
		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
	};
	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size);
	return sock_recvmsg(sock, &msg, msg.msg_flags);
}

static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
{
	int rv;

	rv = drbd_recv_short(connection->data.socket, buf, size, 0);

	if (rv < 0) {
		if (rv == -ECONNRESET)
			drbd_info(connection, "sock was reset by peer\n");
		else if (rv != -ERESTARTSYS)
			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
	} else if (rv == 0) {
		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
			long t;
			rcu_read_lock();
			t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
			rcu_read_unlock();

			t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);

			if (t)
				goto out;
		}
		drbd_info(connection, "sock was shut down by peer\n");
	}

	if (rv != size)
		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);

out:
	return rv;
}

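/* Like drbd_recv(), but fold the result into an error code: 0 if exactly
 * @size bytes were received, a negative error (-EIO for short reads)
 * otherwise. */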
static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
{
	int err;

	err = drbd_recv(connection, buf, size);
	if (err != size) {
		if (err >= 0)
			err = -EIO;
	} else
		err = 0;
	return err;
}

static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
{
	int err;

	err = drbd_recv_all(connection, buf, size);
	if (err && !signal_pending(current))
		drbd_warn(connection, "short read (expected size %d)\n", (int)size);
	return err;
}

/* quoting tcp(7):
 *   On individual connections, the socket buffer size must be set prior to the
 *   listen(2) or connect(2) calls in order to have it take effect.
 * This is our wrapper to do so.
 */
static void drbd_setbufsize(struct socket *sock, unsigned int snd,
			    unsigned int rcv)
{
	/* open coded SO_SNDBUF, SO_RCVBUF */
	if (snd) {
		sock->sk->sk_sndbuf = snd;
		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	}
	if (rcv) {
		sock->sk->sk_rcvbuf = rcv;
		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
}

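/* Actively establish one TCP connection to the peer, bound to the locally
 * configured address.  Returns the connected socket or NULL; "expected"
 * connect failures (peer unreachable, timeout, signal) do not force the
 * connection into C_DISCONNECTING. */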
static struct socket *drbd_try_connect(struct drbd_connection *connection)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	struct sockaddr_in6 peer_in6;
	struct net_conf *nc;
	int err, peer_addr_len, my_addr_len;
	int sndbuf_size, rcvbuf_size, connect_int;
	int disconnect_on_error = 1;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	connect_int = nc->connect_int;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
	memcpy(&src_in6, &connection->my_addr, my_addr_len);

	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_rcvtimeo =
	sock->sk->sk_sndtimeo = connect_int * HZ;
	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	what = "bind before connect";
	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN:    case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			drbd_err(connection, "%s failed, err = %d\n", what, err);
		}
		if (disconnect_on_error)
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
	}

	return sock;
}

struct accept_wait_data {
	struct drbd_connection *connection;
	struct socket *s_listen;
	struct completion door_bell;
	void (*original_sk_state_change)(struct sock *sk);

};

static void drbd_incoming_connection(struct sock *sk)
{
	struct accept_wait_data *ad = sk->sk_user_data;
	void (*state_change)(struct sock *sk);

	state_change = ad->original_sk_state_change;
	if (sk->sk_state == TCP_ESTABLISHED)
		complete(&ad->door_bell);
	state_change(sk);
}

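/* Create and bind the listening socket for the passive side of the
 * connection attempt, and hook its sk_state_change so that an incoming
 * connection completes ad->door_bell.  Returns 0 on success, -EIO
 * otherwise. */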
static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int err, sndbuf_size, rcvbuf_size, my_addr_len;
	struct sockaddr_in6 my_addr;
	struct socket *s_listen;
	struct net_conf *nc;
	const char *what;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
	memcpy(&my_addr, &connection->my_addr, my_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
	if (err) {
		s_listen = NULL;
		goto out;
	}

	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);

	what = "bind before listen";
	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
	if (err < 0)
		goto out;

	ad->s_listen = s_listen;
	write_lock_bh(&s_listen->sk->sk_callback_lock);
	ad->original_sk_state_change = s_listen->sk->sk_state_change;
	s_listen->sk->sk_state_change = drbd_incoming_connection;
	s_listen->sk->sk_user_data = ad;
	write_unlock_bh(&s_listen->sk->sk_callback_lock);

	what = "listen";
	err = s_listen->ops->listen(s_listen, 5);
	if (err < 0)
		goto out;

	return 0;
out:
	if (s_listen)
		sock_release(s_listen);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "%s failed, err = %d\n", what, err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	return -EIO;
}

static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_state_change = ad->original_sk_state_change;
	sk->sk_user_data = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

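/* Passive side: wait (connect_int, with random jitter) for the peer to
 * connect to our listening socket, then accept it.  Returns the accepted
 * socket or NULL on timeout/error. */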
static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int timeo, connect_int, err = 0;
	struct socket *s_estab = NULL;
	struct net_conf *nc;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	connect_int = nc->connect_int;
	rcu_read_unlock();

	timeo = connect_int * HZ;
	/* 28.5% random jitter */
	timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;

	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
	if (err <= 0)
		return NULL;

	err = kernel_accept(ad->s_listen, &s_estab, 0);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "accept failed, err = %d\n", err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	if (s_estab)
		unregister_state_change(s_estab->sk, ad);

	return s_estab;
}

static int decode_header(struct drbd_connection *, void *, struct packet_info *);

static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
			     enum drbd_packet cmd)
{
	if (!conn_prepare_command(connection, sock))
		return -EIO;
	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
}

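/* Read and decode the very first packet on a freshly accepted socket.
 * Returns the packet command (P_INITIAL_DATA or P_INITIAL_META are the
 * expected ones) or a negative error. */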
static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
{
	unsigned int header_size = drbd_header_size(connection);
	struct packet_info pi;
	struct net_conf *nc;
	int err;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
	rcu_read_unlock();

	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
	if (err != header_size) {
		if (err >= 0)
			err = -EIO;
		return err;
	}
	err = decode_header(connection, connection->data.rbuf, &pi);
	if (err)
		return err;
	return pi.cmd;
}

/**
 * drbd_socket_okay() - Free the socket if its connection is not okay
 * @sock:	pointer to the pointer to the socket.
 */
static bool drbd_socket_okay(struct socket **sock)
{
	int rr;
	char tb[4];

	if (!*sock)
		return false;

	rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);

	if (rr > 0 || rr == -EAGAIN) {
		return true;
	} else {
		sock_release(*sock);
		*sock = NULL;
		return false;
	}
}

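/* Both sockets must exist; sleep for sock_check_timeo (or ping_timeo if
 * unset) and then verify that each of them is still usable. */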
static bool connection_established(struct drbd_connection *connection,
				   struct socket **sock1,
				   struct socket **sock2)
{
	struct net_conf *nc;
	int timeout;
	bool ok;

	if (!*sock1 || !*sock2)
		return false;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
	rcu_read_unlock();
	schedule_timeout_interruptible(timeout);

	ok = drbd_socket_okay(sock1);
	ok = drbd_socket_okay(sock2) && ok;

	return ok;
}

/* Gets called if a connection is established, or if a new minor gets created
   in a connection */
int drbd_connected(struct drbd_peer_device *peer_device)
{
	struct drbd_device *device = peer_device->device;
	int err;

	atomic_set(&device->packet_seq, 0);
	device->peer_seq = 0;

	device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
		&peer_device->connection->cstate_mutex :
		&device->own_state_mutex;

	err = drbd_send_sync_param(peer_device);
	if (!err)
		err = drbd_send_sizes(peer_device, 0, 0);
	if (!err)
		err = drbd_send_uuids(peer_device);
	if (!err)
		err = drbd_send_current_state(peer_device);
	clear_bit(USE_DEGR_WFC_T, &device->flags);
	clear_bit(RESIZE_PENDING, &device->flags);
	atomic_set(&device->ap_in_flight, 0);
	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
	return err;
}

/*
 * return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 *  -2 We do not have a network config...
 */
static int conn_connect(struct drbd_connection *connection)
{
	struct drbd_socket sock, msock;
	struct drbd_peer_device *peer_device;
	struct net_conf *nc;
	int vnr, timeout, h;
	bool discard_my_data, ok;
	enum drbd_state_rv rv;
	struct accept_wait_data ad = {
		.connection = connection,
		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
	};

	clear_bit(DISCONNECT_SENT, &connection->flags);
	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
		return -2;

	mutex_init(&sock.mutex);
	sock.sbuf = connection->data.sbuf;
	sock.rbuf = connection->data.rbuf;
	sock.socket = NULL;
	mutex_init(&msock.mutex);
	msock.sbuf = connection->meta.sbuf;
	msock.rbuf = connection->meta.rbuf;
	msock.socket = NULL;

	/* Assume that the peer only understands protocol 80 until we know better.  */
	connection->agreed_pro_version = 80;

	if (prepare_listen_socket(connection, &ad))
		return 0;

	do {
		struct socket *s;

		s = drbd_try_connect(connection);
		if (s) {
			if (!sock.socket) {
				sock.socket = s;
				send_first_packet(connection, &sock, P_INITIAL_DATA);
			} else if (!msock.socket) {
				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
				msock.socket = s;
				send_first_packet(connection, &msock, P_INITIAL_META);
			} else {
				drbd_err(connection, "Logic error in conn_connect()\n");
				goto out_release_sockets;
			}
		}

		if (connection_established(connection, &sock.socket, &msock.socket))
			break;

retry:
		s = drbd_wait_for_connect(connection, &ad);
		if (s) {
			int fp = receive_first_packet(connection, s);
			drbd_socket_okay(&sock.socket);
			drbd_socket_okay(&msock.socket);
			switch (fp) {
			case P_INITIAL_DATA:
				if (sock.socket) {
					drbd_warn(connection, "initial packet S crossed\n");
					sock_release(sock.socket);
					sock.socket = s;
					goto randomize;
				}
				sock.socket = s;
				break;
			case P_INITIAL_META:
				set_bit(RESOLVE_CONFLICTS, &connection->flags);
				if (msock.socket) {
					drbd_warn(connection, "initial packet M crossed\n");
					sock_release(msock.socket);
					msock.socket = s;
					goto randomize;
				}
				msock.socket = s;
				break;
			default:
				drbd_warn(connection, "Error receiving initial packet\n");
				sock_release(s);
randomize:
				if (prandom_u32() & 1)
					goto retry;
			}
		}

		if (connection->cstate <= C_DISCONNECTING)
			goto out_release_sockets;
		if (signal_pending(current)) {
			flush_signals(current);
			smp_rmb();
			if (get_t_state(&connection->receiver) == EXITING)
				goto out_release_sockets;
		}

		ok = connection_established(connection, &sock.socket, &msock.socket);
	} while (!ok);

	if (ad.s_listen)
		sock_release(ad.s_listen);

	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */

	sock.socket->sk->sk_allocation = GFP_NOIO;
	msock.socket->sk->sk_allocation = GFP_NOIO;

	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;

	/* NOT YET ...
	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	 * first set it to the P_CONNECTION_FEATURES timeout,
	 * which we set to 4x the configured ping_timeout. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);

	sock.socket->sk->sk_sndtimeo =
	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;

	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
	timeout = nc->timeout * HZ / 10;
	discard_my_data = nc->discard_my_data;
	rcu_read_unlock();

	msock.socket->sk->sk_sndtimeo = timeout;

	/* we don't want delays.
	 * we use TCP_CORK where appropriate, though */
	drbd_tcp_nodelay(sock.socket);
	drbd_tcp_nodelay(msock.socket);

	connection->data.socket = sock.socket;
	connection->meta.socket = msock.socket;
	connection->last_received = jiffies;

	h = drbd_do_features(connection);
	if (h <= 0)
		return h;

	if (connection->cram_hmac_tfm) {
		/* drbd_request_state(device, NS(conn, WFAuth)); */
		switch (drbd_do_auth(connection)) {
		case -1:
			drbd_err(connection, "Authentication of peer failed\n");
			return -1;
		case 0:
			drbd_err(connection, "Authentication of peer failed, trying again.\n");
			return 0;
		}
	}

	connection->data.socket->sk->sk_sndtimeo = timeout;
	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;

	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
		return -1;

	/* Prevent a race between resync-handshake and
	 * being promoted to Primary.
	 *
	 * Grab and release the state mutex, so we know that any current
	 * drbd_set_role() is finished, and any incoming drbd_set_role
	 * will see the STATE_SENT flag, and wait for it to be cleared.
	 */
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
		mutex_lock(peer_device->device->state_mutex);

	/* avoid a race with conn_request_state( C_DISCONNECTING ) */
	spin_lock_irq(&connection->resource->req_lock);
	set_bit(STATE_SENT, &connection->flags);
	spin_unlock_irq(&connection->resource->req_lock);

	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
		mutex_unlock(peer_device->device->state_mutex);

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		kref_get(&device->kref);
		rcu_read_unlock();

		if (discard_my_data)
			set_bit(DISCARD_MY_DATA, &device->flags);
		else
			clear_bit(DISCARD_MY_DATA, &device->flags);

		drbd_connected(peer_device);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
		clear_bit(STATE_SENT, &connection->flags);
		return 0;
	}

	drbd_thread_start(&connection->ack_receiver);
	/* opencoded create_singlethread_workqueue(),
	 * to be able to use format string arguments */
	connection->ack_sender =
		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
	if (!connection->ack_sender) {
		drbd_err(connection, "Failed to create workqueue ack_sender\n");
		return 0;
	}

	mutex_lock(&connection->resource->conf_update);
	/* The discard_my_data flag is a single-shot modifier to the next
	 * connection attempt, the handshake of which is now well underway.
	 * No need for rcu style copying of the whole struct
	 * just to clear a single value. */
	connection->net_conf->discard_my_data = 0;
	mutex_unlock(&connection->resource->conf_update);

	return h;

out_release_sockets:
	if (ad.s_listen)
		sock_release(ad.s_listen);
	if (sock.socket)
		sock_release(sock.socket);
	if (msock.socket)
		sock_release(msock.socket);
	return -1;
}

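/* Decode one on-the-wire packet header into @pi, dispatching on the agreed
 * header format: p_header100 (protocol 100+, carries a volume number),
 * p_header95 (32-bit length), or the original p_header80. */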
static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
{
	unsigned int header_size = drbd_header_size(connection);

	if (header_size == sizeof(struct p_header100) &&
	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
		struct p_header100 *h = header;
		if (h->pad != 0) {
			drbd_err(connection, "Header padding is not zero\n");
			return -EINVAL;
		}
		pi->vnr = be16_to_cpu(h->volume);
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be32_to_cpu(h->length);
	} else if (header_size == sizeof(struct p_header95) &&
		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
		struct p_header95 *h = header;
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be32_to_cpu(h->length);
		pi->vnr = 0;
	} else if (header_size == sizeof(struct p_header80) &&
		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
		struct p_header80 *h = header;
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be16_to_cpu(h->length);
		pi->vnr = 0;
	} else {
		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
			 be32_to_cpu(*(__be32 *)header),
			 connection->agreed_pro_version);
		return -EINVAL;
	}
	pi->data = header + header_size;
	return 0;
}

static void drbd_unplug_all_devices(struct drbd_connection *connection)
{
	if (current->plug == &connection->receiver_plug) {
		blk_finish_plug(&connection->receiver_plug);
		blk_start_plug(&connection->receiver_plug);
	} /* else: maybe just schedule() ?? */
}

static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
{
	void *buffer = connection->data.rbuf;
	int err;

	err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
	if (err)
		return err;

	err = decode_header(connection, buffer, pi);
	connection->last_received = jiffies;

	return err;
}

static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
{
	void *buffer = connection->data.rbuf;
	unsigned int size = drbd_header_size(connection);
	int err;

	err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
	if (err != size) {
		/* If we have nothing in the receive buffer now, to reduce
		 * application latency, try to drain the backend queues as
		 * quickly as possible, and let remote TCP know what we have
		 * received so far. */
		if (err == -EAGAIN) {
			drbd_tcp_quickack(connection->data.socket);
			drbd_unplug_all_devices(connection);
		}
		if (err > 0) {
			buffer += err;
			size -= err;
		}
		err = drbd_recv_all_warn(connection, buffer, size);
		if (err)
			return err;
	}

	err = decode_header(connection, connection->data.rbuf, pi);
	connection->last_received = jiffies;

	return err;
}

/* This is blkdev_issue_flush, but asynchronous.
 * We want to submit to all component volumes in parallel,
 * then wait for all completions.
 */
struct issue_flush_context {
	atomic_t pending;
	int error;
	struct completion done;
};
struct one_flush_context {
	struct drbd_device *device;
	struct issue_flush_context *ctx;
};

static void one_flush_endio(struct bio *bio)
{
	struct one_flush_context *octx = bio->bi_private;
	struct drbd_device *device = octx->device;
	struct issue_flush_context *ctx = octx->ctx;

	if (bio->bi_status) {
		ctx->error = blk_status_to_errno(bio->bi_status);
		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
	}
	kfree(octx);
	bio_put(bio);

	clear_bit(FLUSH_PENDING, &device->flags);
	put_ldev(device);
	kref_put(&device->kref, drbd_destroy_device);

	if (atomic_dec_and_test(&ctx->pending))
		complete(&ctx->done);
}

static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 0);
	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
	if (!bio || !octx) {
		drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
		/* FIXME: what else can I do now?  disconnecting or detaching
		 * really does not help to improve the state of the world, either.
		 */
		kfree(octx);
		if (bio)
			bio_put(bio);

		ctx->error = -ENOMEM;
		put_ldev(device);
		kref_put(&device->kref, drbd_destroy_device);
		return;
	}

	octx->device = device;
	octx->ctx = ctx;
	bio_set_dev(bio, device->ldev->backing_bdev);
	bio->bi_private = octx;
	bio->bi_end_io = one_flush_endio;
	bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;

	device->flush_jif = jiffies;
	set_bit(FLUSH_PENDING, &device->flags);
	atomic_inc(&ctx->pending);
	submit_bio(bio);
}

static void drbd_flush(struct drbd_connection *connection)
{
	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
		struct drbd_peer_device *peer_device;
		struct issue_flush_context ctx;
		int vnr;

		atomic_set(&ctx.pending, 1);
		ctx.error = 0;
		init_completion(&ctx.done);

		rcu_read_lock();
		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
			struct drbd_device *device = peer_device->device;

			if (!get_ldev(device))
				continue;
			kref_get(&device->kref);
			rcu_read_unlock();

			submit_one_flush(device, &ctx);

			rcu_read_lock();
		}
		rcu_read_unlock();

		/* Do we want to add a timeout,
		 * if disk-timeout is set? */
		if (!atomic_dec_and_test(&ctx.pending))
			wait_for_completion(&ctx.done);

		if (ctx.error) {
			/* would rather check on EOPNOTSUPP, but that is not reliable.
			 * don't try again for ANY return value != 0
			 * if (rv == -EOPNOTSUPP) */
			/* Any error is already reported by bio_endio callback. */
			drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
		}
	}
}

/**
 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
 * @connection:	DRBD connection.
 * @epoch:	Epoch object.
 * @ev:		Epoch event.
 */
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
					       struct drbd_epoch *epoch,
					       enum epoch_event ev)
{
	int epoch_size;
	struct drbd_epoch *next_epoch;
	enum finish_epoch rv = FE_STILL_LIVE;

	spin_lock(&connection->epoch_lock);
	do {
		next_epoch = NULL;

		epoch_size = atomic_read(&epoch->epoch_size);

		switch (ev & ~EV_CLEANUP) {
		case EV_PUT:
			atomic_dec(&epoch->active);
			break;
		case EV_GOT_BARRIER_NR:
			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
			break;
		case EV_BECAME_LAST:
			/* nothing to do*/
			break;
		}

		if (epoch_size != 0 &&
		    atomic_read(&epoch->active) == 0 &&
		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
			if (!(ev & EV_CLEANUP)) {
				spin_unlock(&connection->epoch_lock);
				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
				spin_lock(&connection->epoch_lock);
			}
#if 0
			/* FIXME: dec unacked on connection, once we have
			 * something to count pending connection packets in. */
			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
				dec_unacked(epoch->connection);
#endif

			if (connection->current_epoch != epoch) {
				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
				list_del(&epoch->list);
				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
				connection->epochs--;
				kfree(epoch);

				if (rv == FE_STILL_LIVE)
					rv = FE_DESTROYED;
			} else {
				epoch->flags = 0;
				atomic_set(&epoch->epoch_size, 0);
				/* atomic_set(&epoch->active, 0); is already zero */
				if (rv == FE_STILL_LIVE)
					rv = FE_RECYCLED;
			}
		}

		if (!next_epoch)
			break;

		epoch = next_epoch;
	} while (1);

	spin_unlock(&connection->epoch_lock);

	return rv;
}

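/* Clamp the requested write ordering to what this backing device's disk_conf
 * allows (flushes and/or drain may be disabled by configuration). */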
static enum write_ordering_e
max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
{
	struct disk_conf *dc;

	dc = rcu_dereference(bdev->disk_conf);

	if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
		wo = WO_DRAIN_IO;
	if (wo == WO_DRAIN_IO && !dc->disk_drain)
		wo = WO_NONE;

	return wo;
}

/**
 * drbd_bump_write_ordering() - Fall back to another write ordering method
 * @resource:	DRBD resource.
 * @bdev:	backing device to additionally take into account, or NULL.
 * @wo:		Write ordering method to try.
 */
void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
			      enum write_ordering_e wo)
{
	struct drbd_device *device;
	enum write_ordering_e pwo;
	int vnr;
	static char *write_ordering_str[] = {
		[WO_NONE] = "none",
		[WO_DRAIN_IO] = "drain",
		[WO_BDEV_FLUSH] = "flush",
	};

	pwo = resource->write_ordering;
	if (wo != WO_BDEV_FLUSH)
		wo = min(pwo, wo);
	rcu_read_lock();
	idr_for_each_entry(&resource->devices, device, vnr) {
		if (get_ldev(device)) {
			wo = max_allowed_wo(device->ldev, wo);
			if (device->ldev == bdev)
				bdev = NULL;
			put_ldev(device);
		}
	}

	if (bdev)
		wo = max_allowed_wo(bdev, wo);

	rcu_read_unlock();

	resource->write_ordering = wo;
	if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}

/*
 * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
 * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
 * will directly go to fallback mode, submitting normal writes, and
 * never even try to UNMAP.
 *
 * And dm-thin does not do this (yet), mostly because in general it has
 * to assume that "skip_block_zeroing" is set.  See also:
 * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
 * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
 *
 * We *may* ignore the discard-zeroes-data setting, if so configured.
 *
 * Assumption is that this "discard_zeroes_data=0" is only because the backend
 * may ignore partial unaligned discards.
 *
 * LVM/DM thin as of at least
 *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
 *   Library version: 1.02.93-RHEL7 (2015-01-28)
 *   Driver version:  4.29.0
 * still behaves this way.
 *
 * For unaligned (wrt. alignment and granularity) or too small discards,
 * we zero-out the initial (and/or) trailing unaligned partial chunks,
 * but discard all the aligned full chunks.
 *
 * At least for LVM/DM thin, with skip_block_zeroing=false,
 * the result is effectively "discard_zeroes_data=1".
 */
/* flags: EE_TRIM|EE_ZEROOUT */
int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
{
	struct block_device *bdev = device->ldev->backing_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t tmp, nr;
	unsigned int max_discard_sectors, granularity;
	int alignment;
	int err = 0;

	if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
		goto zero_out;

	/* Zero-sector (unknown) and one-sector granularities are the same.  */
	granularity = max(q->limits.discard_granularity >> 9, 1U);
	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;

	max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
	max_discard_sectors -= max_discard_sectors % granularity;
	if (unlikely(!max_discard_sectors))
		goto zero_out;

	if (nr_sectors < granularity)
		goto zero_out;

	tmp = start;
	if (sector_div(tmp, granularity) != alignment) {
		if (nr_sectors < 2*granularity)
			goto zero_out;
		/* start + gran - (start + gran - align) % gran */
		tmp = start + granularity - alignment;
		tmp = start + granularity - sector_div(tmp, granularity);

		nr = tmp - start;
		/* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
		 * layers are below us, some may have smaller granularity */
		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
		nr_sectors -= nr;
		start = tmp;
	}
	while (nr_sectors >= max_discard_sectors) {
		err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0);
		nr_sectors -= max_discard_sectors;
		start += max_discard_sectors;
	}
	if (nr_sectors) {
		/* max_discard_sectors is unsigned int (and a multiple of
		 * granularity, we made sure of that above already);
		 * nr is < max_discard_sectors;
		 * I don't need sector_div here, even though nr is sector_t */
		nr = nr_sectors;
		nr -= (unsigned int)nr % granularity;
		if (nr) {
			err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
			nr_sectors -= nr;
			start += nr;
		}
	}
 zero_out:
	if (nr_sectors) {
		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
				(flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
	}
	return err != 0;
}

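/* A backend qualifies for real discards only if its queue supports discard
 * at all and the configuration allows treating aligned discards as zeroing
 * (discard_zeroes_if_aligned). */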
static bool can_do_reliable_discards(struct drbd_device *device)
{
	struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
	struct disk_conf *dc;
	bool can_do;

	if (!blk_queue_discard(q))
		return false;

	rcu_read_lock();
	dc = rcu_dereference(device->ldev->disk_conf);
	can_do = dc->discard_zeroes_if_aligned;
	rcu_read_unlock();
	return can_do;
}

static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
	/* If the backend cannot discard, or does not guarantee
	 * read-back zeroes in discarded ranges, we fall back to
	 * zero-out.  Unless configuration specifically requested
	 * otherwise. */
	if (!can_do_reliable_discards(device))
		peer_req->flags |= EE_ZEROOUT;

	if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
	    peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
		peer_req->flags |= EE_WAS_ERROR;
	drbd_endio_write_sec_final(peer_req);
}

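/* Submit a peer WRITE SAME request to the backing device via
 * blkdev_issue_write_same(); flag the peer request on error and complete it. */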
static void drbd_issue_peer_wsame(struct drbd_device *device,
				  struct drbd_peer_request *peer_req)
{
	struct block_device *bdev = device->ldev->backing_bdev;
	sector_t s = peer_req->i.sector;
	sector_t nr = peer_req->i.size >> 9;
	if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
		peer_req->flags |= EE_WAS_ERROR;
	drbd_endio_write_sec_final(peer_req);
}


45bb912b 1631/**
fbe29dec 1632 * drbd_submit_peer_request()
b30ab791 1633 * @device: DRBD device.
db830c46 1634 * @peer_req: peer request
1eff9d32 1635 * @op: REQ_OP_WRITE, REQ_OP_DISCARD, REQ_OP_WRITE_ZEROES or REQ_OP_WRITE_SAME
 * @op_flags: additional flags for bio->bi_opf (REQ_SYNC, REQ_FUA, REQ_PREFLUSH, ...)
 * @fault_type: DRBD_FAULT_* category used for fault injection
10f6d992
LE
1636 *
1637 * May spread the pages to multiple bios,
1638 * depending on bio_add_page restrictions.
1639 *
1640 * Returns 0 if all bios have been submitted,
1641 * -ENOMEM if we could not allocate enough bios,
1642 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1643 * single page to an empty bio (which should never happen and likely indicates
1644 * that the lower level IO stack is in some way broken). This has been observed
1645 * on certain Xen deployments.
45bb912b
LE
1646 */
1647/* TODO allocate from our own bio_set. */
b30ab791 1648int drbd_submit_peer_request(struct drbd_device *device,
fbe29dec 1649 struct drbd_peer_request *peer_req,
bb3cc85e
MC
1650 const unsigned op, const unsigned op_flags,
1651 const int fault_type)
45bb912b
LE
1652{
1653 struct bio *bios = NULL;
1654 struct bio *bio;
db830c46
AG
1655 struct page *page = peer_req->pages;
1656 sector_t sector = peer_req->i.sector;
11f8b2b6 1657 unsigned data_size = peer_req->i.size;
45bb912b 1658 unsigned n_bios = 0;
11f8b2b6 1659 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
10f6d992 1660 int err = -ENOMEM;
45bb912b 1661
dd4f699d
LE
 1662 /* TRIM/ZEROOUT/WRITE_SAME: for now, always handled synchronously via the
 1663 * helper functions above (drbd_issue_peer_discard_or_zero_out,
 1664 * drbd_issue_peer_wsame), which do the right thing wrt. bio splitting.
1665 * Correctness first, performance later. Next step is to code an
1666 * asynchronous variant of the same.
1667 */
f31e583a 1668 if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) {
a0fb3c47
LE
1669 /* wait for all pending IO completions, before we start
1670 * zeroing things out. */
5dd2ca19 1671 conn_wait_active_ee_empty(peer_req->peer_device->connection);
45d2933c
LE
1672 /* add it to the active list now,
1673 * so we can find it to present it in debugfs */
21ae5d7f
LE
1674 peer_req->submit_jif = jiffies;
1675 peer_req->flags |= EE_SUBMITTED;
700ca8c0
PR
1676
1677 /* If this was a resync request from receive_rs_deallocated(),
1678 * it is already on the sync_ee list */
1679 if (list_empty(&peer_req->w.list)) {
1680 spin_lock_irq(&device->resource->req_lock);
1681 list_add_tail(&peer_req->w.list, &device->active_ee);
1682 spin_unlock_irq(&device->resource->req_lock);
1683 }
1684
f31e583a
LE
1685 if (peer_req->flags & (EE_TRIM|EE_ZEROOUT))
1686 drbd_issue_peer_discard_or_zero_out(device, peer_req);
9104d31a
LE
1687 else /* EE_WRITE_SAME */
1688 drbd_issue_peer_wsame(device, peer_req);
a0fb3c47
LE
1689 return 0;
1690 }
1691
45bb912b
LE
1692 /* In most cases, we will only need one bio. But in case the lower
1693 * level restrictions happen to be different at this offset on this
1694 * side than those of the sending peer, we may need to submit the
9476f39d
LE
1695 * request in more than one bio.
1696 *
 1697 * Plain bio_alloc is good enough here; this is not a DRBD-internally
 1698 * generated bio, but a bio allocated on behalf of the peer.
1699 */
45bb912b
LE
1700next_bio:
1701 bio = bio_alloc(GFP_NOIO, nr_pages);
1702 if (!bio) {
a0fb3c47 1703 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
45bb912b
LE
1704 goto fail;
1705 }
db830c46 1706 /* > peer_req->i.sector, unless this is the first bio */
4f024f37 1707 bio->bi_iter.bi_sector = sector;
74d46992 1708 bio_set_dev(bio, device->ldev->backing_bdev);
bb3cc85e 1709 bio_set_op_attrs(bio, op, op_flags);
db830c46 1710 bio->bi_private = peer_req;
fcefa62e 1711 bio->bi_end_io = drbd_peer_request_endio;
45bb912b
LE
1712
1713 bio->bi_next = bios;
1714 bios = bio;
1715 ++n_bios;
1716
1717 page_chain_for_each(page) {
11f8b2b6 1718 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
06efffda 1719 if (!bio_add_page(bio, page, len, 0))
45bb912b 1720 goto next_bio;
11f8b2b6 1721 data_size -= len;
45bb912b
LE
1722 sector += len >> 9;
1723 --nr_pages;
1724 }
11f8b2b6 1725 D_ASSERT(device, data_size == 0);
a0fb3c47 1726 D_ASSERT(device, page == NULL);
45bb912b 1727
db830c46 1728 atomic_set(&peer_req->pending_bios, n_bios);
21ae5d7f
LE
1729 /* for debugfs: update timestamp, mark as submitted */
1730 peer_req->submit_jif = jiffies;
1731 peer_req->flags |= EE_SUBMITTED;
45bb912b
LE
1732 do {
1733 bio = bios;
1734 bios = bios->bi_next;
1735 bio->bi_next = NULL;
1736
b30ab791 1737 drbd_generic_make_request(device, fault_type, bio);
45bb912b 1738 } while (bios);
45bb912b
LE
1739 return 0;
1740
1741fail:
1742 while (bios) {
1743 bio = bios;
1744 bios = bios->bi_next;
1745 bio_put(bio);
1746 }
10f6d992 1747 return err;
45bb912b
LE
1748}
1749
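The multi-bio path above is easier to see in isolation. Below is a minimal user-space sketch (all names, types and limits are made up for illustration, none of this is kernel API) of the same "goto next_bio" pattern: when the current container refuses another page, allocate a fresh one, prepend it to the singly linked list, and retry the same page.

#include <stdio.h>
#include <stdlib.h>

struct fake_bio {
	struct fake_bio *next;
	int n_pages;
	int capacity;
};

static int add_page(struct fake_bio *b)
{
	if (b->n_pages >= b->capacity)
		return 0;		/* like bio_add_page() refusing: bio is full */
	b->n_pages++;
	return 1;
}

int main(void)
{
	struct fake_bio *bios = NULL, *bio;
	int pages_left = 10, n_bios = 0;

next_bio:
	bio = calloc(1, sizeof(*bio));
	if (!bio)
		return 1;
	bio->capacity = 4;		/* pretend the lower level takes 4 pages per bio */
	bio->next = bios;		/* prepend, like "bio->bi_next = bios" above */
	bios = bio;
	n_bios++;

	while (pages_left) {
		if (!add_page(bio))
			goto next_bio;	/* current bio full, start another, retry page */
		pages_left--;
	}
	printf("needed %d bios\n", n_bios);	/* 10 pages, 4 per bio -> 3 bios */

	while (bios) {			/* "submit" (here: just free) them all */
		bio = bios;
		bios = bios->next;
		free(bio);
	}
	return 0;
}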
b30ab791 1750static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
db830c46 1751 struct drbd_peer_request *peer_req)
53840641 1752{
db830c46 1753 struct drbd_interval *i = &peer_req->i;
53840641 1754
b30ab791 1755 drbd_remove_interval(&device->write_requests, i);
53840641
AG
1756 drbd_clear_interval(i);
1757
6c852bec 1758 /* Wake up any processes waiting for this peer request to complete. */
53840641 1759 if (i->waiting)
b30ab791 1760 wake_up(&device->misc_wait);
53840641
AG
1761}
1762
bde89a9e 1763static void conn_wait_active_ee_empty(struct drbd_connection *connection)
77fede51 1764{
c06ece6b 1765 struct drbd_peer_device *peer_device;
77fede51
PR
1766 int vnr;
1767
1768 rcu_read_lock();
c06ece6b
AG
1769 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1770 struct drbd_device *device = peer_device->device;
1771
b30ab791 1772 kref_get(&device->kref);
77fede51 1773 rcu_read_unlock();
b30ab791 1774 drbd_wait_ee_list_empty(device, &device->active_ee);
05a10ec7 1775 kref_put(&device->kref, drbd_destroy_device);
77fede51
PR
1776 rcu_read_lock();
1777 }
1778 rcu_read_unlock();
1779}
1780
bde89a9e 1781static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
b411b363 1782{
2451fc3b 1783 int rv;
e658983a 1784 struct p_barrier *p = pi->data;
b411b363
PR
1785 struct drbd_epoch *epoch;
1786
9ed57dcb
LE
1787 /* FIXME these are unacked on connection,
1788 * not a specific (peer)device.
1789 */
bde89a9e
AG
1790 connection->current_epoch->barrier_nr = p->barrier;
1791 connection->current_epoch->connection = connection;
1792 rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
b411b363
PR
1793
1794 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1795 * the activity log, which means it would not be resynced in case the
1796 * R_PRIMARY crashes now.
1797 * Therefore we must send the barrier_ack after the barrier request was
1798 * completed. */
e9526580 1799 switch (connection->resource->write_ordering) {
f6ba8636 1800 case WO_NONE:
b411b363 1801 if (rv == FE_RECYCLED)
82bc0194 1802 return 0;
2451fc3b
PR
1803
1804 /* receiver context, in the writeout path of the other node.
1805 * avoid potential distributed deadlock */
1806 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1807 if (epoch)
1808 break;
1809 else
1ec861eb 1810 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
2451fc3b 1811 /* Fall through */
b411b363 1812
f6ba8636
AG
1813 case WO_BDEV_FLUSH:
1814 case WO_DRAIN_IO:
bde89a9e
AG
1815 conn_wait_active_ee_empty(connection);
1816 drbd_flush(connection);
2451fc3b 1817
bde89a9e 1818 if (atomic_read(&connection->current_epoch->epoch_size)) {
2451fc3b
PR
1819 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1820 if (epoch)
1821 break;
b411b363
PR
1822 }
1823
82bc0194 1824 return 0;
2451fc3b 1825 default:
e9526580
PR
1826 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1827 connection->resource->write_ordering);
82bc0194 1828 return -EIO;
b411b363
PR
1829 }
1830
1831 epoch->flags = 0;
1832 atomic_set(&epoch->epoch_size, 0);
1833 atomic_set(&epoch->active, 0);
1834
bde89a9e
AG
1835 spin_lock(&connection->epoch_lock);
1836 if (atomic_read(&connection->current_epoch->epoch_size)) {
1837 list_add(&epoch->list, &connection->current_epoch->list);
1838 connection->current_epoch = epoch;
1839 connection->epochs++;
b411b363
PR
1840 } else {
1841 /* The current_epoch got recycled while we allocated this one... */
1842 kfree(epoch);
1843 }
bde89a9e 1844 spin_unlock(&connection->epoch_lock);
b411b363 1845
82bc0194 1846 return 0;
b411b363
PR
1847}
1848
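As a condensed illustration of the decision taken under epoch_lock above (a user-space sketch with a hypothetical struct; the allocate-then-check ordering of the real code is simplified here): a newly allocated epoch is only linked in when the current one still has writes in flight, otherwise the current epoch is simply reused.

#include <stdio.h>
#include <stdlib.h>

struct epoch {
	int epoch_size;		/* writes still belonging to this epoch */
	struct epoch *prev;
};

static struct epoch *start_new_epoch(struct epoch *current_epoch)
{
	struct epoch *e;

	if (current_epoch->epoch_size == 0)
		return current_epoch;	/* nothing in flight: recycle it */

	e = calloc(1, sizeof(*e));
	if (!e)
		return current_epoch;	/* allocation failed: keep the old one */
	e->prev = current_epoch;
	return e;
}

int main(void)
{
	struct epoch first = { .epoch_size = 3 };
	struct epoch *cur = start_new_epoch(&first);

	printf("%s\n", cur == &first ? "recycled current epoch" : "switched to new epoch");
	if (cur != &first)
		free(cur);
	return 0;
}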
9104d31a 1849/* quick wrapper in case payload size != request_size (write same) */
3d0e6375 1850static void drbd_csum_ee_size(struct crypto_shash *h,
9104d31a
LE
1851 struct drbd_peer_request *r, void *d,
1852 unsigned int payload_size)
1853{
1854 unsigned int tmp = r->i.size;
1855 r->i.size = payload_size;
1856 drbd_csum_ee(h, r, d);
1857 r->i.size = tmp;
1858}
1859
b411b363 1860/* used from receive_RSDataReply (recv_resync_read)
9104d31a
LE
1861 * and from receive_Data.
1862 * data_size: actual payload ("data in")
1863 * for normal writes that is bi_size.
1864 * for discards, that is zero.
1865 * for write same, it is logical_block_size.
1866 * both trim and write same have the bi_size ("data len to be affected")
1867 * as extra argument in the packet header.
1868 */
f6ffca9f 1869static struct drbd_peer_request *
69a22773 1870read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
a0fb3c47 1871 struct packet_info *pi) __must_hold(local)
b411b363 1872{
69a22773 1873 struct drbd_device *device = peer_device->device;
b30ab791 1874 const sector_t capacity = drbd_get_capacity(device->this_bdev);
db830c46 1875 struct drbd_peer_request *peer_req;
b411b363 1876 struct page *page;
11f8b2b6
AG
1877 int digest_size, err;
1878 unsigned int data_size = pi->size, ds;
69a22773
AG
1879 void *dig_in = peer_device->connection->int_dig_in;
1880 void *dig_vv = peer_device->connection->int_dig_vv;
6b4388ac 1881 unsigned long *data;
a0fb3c47 1882 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
f31e583a 1883 struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
9104d31a 1884 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
b411b363 1885
11f8b2b6 1886 digest_size = 0;
a0fb3c47 1887 if (!trim && peer_device->connection->peer_integrity_tfm) {
3d0e6375 1888 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
9f5bdc33
AG
1889 /*
1890 * FIXME: Receive the incoming digest into the receive buffer
1891 * here, together with its struct p_data?
1892 */
11f8b2b6 1893 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
a5c31904 1894 if (err)
b411b363 1895 return NULL;
11f8b2b6 1896 data_size -= digest_size;
b411b363
PR
1897 }
1898
9104d31a
LE
1899 /* assume request_size == data_size, but special case trim and wsame. */
1900 ds = data_size;
a0fb3c47 1901 if (trim) {
9104d31a
LE
1902 if (!expect(data_size == 0))
1903 return NULL;
1904 ds = be32_to_cpu(trim->size);
f31e583a
LE
1905 } else if (zeroes) {
1906 if (!expect(data_size == 0))
1907 return NULL;
1908 ds = be32_to_cpu(zeroes->size);
9104d31a
LE
1909 } else if (wsame) {
1910 if (data_size != queue_logical_block_size(device->rq_queue)) {
1911 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1912 data_size, queue_logical_block_size(device->rq_queue));
1913 return NULL;
1914 }
1915 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1916 drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1917 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1918 return NULL;
1919 }
1920 ds = be32_to_cpu(wsame->size);
a0fb3c47
LE
1921 }
1922
9104d31a 1923 if (!expect(IS_ALIGNED(ds, 512)))
841ce241 1924 return NULL;
f31e583a 1925 if (trim || wsame || zeroes) {
9104d31a
LE
1926 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1927 return NULL;
1928 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
841ce241 1929 return NULL;
b411b363 1930
6666032a
LE
 1931 /* even though we trust our peer,
1932 * we sometimes have to double check. */
9104d31a 1933 if (sector + (ds>>9) > capacity) {
d0180171 1934 drbd_err(device, "request from peer beyond end of local disk: "
fdda6544 1935 "capacity: %llus < sector: %llus + size: %u\n",
6666032a 1936 (unsigned long long)capacity,
9104d31a 1937 (unsigned long long)sector, ds);
6666032a
LE
1938 return NULL;
1939 }
1940
b411b363
PR
1941 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1942 * "criss-cross" setup, that might cause write-out on some other DRBD,
1943 * which in turn might block on the other node at this very place. */
9104d31a 1944 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
db830c46 1945 if (!peer_req)
b411b363 1946 return NULL;
45bb912b 1947
21ae5d7f 1948 peer_req->flags |= EE_WRITE;
9104d31a 1949 if (trim) {
f31e583a
LE
1950 peer_req->flags |= EE_TRIM;
1951 return peer_req;
1952 }
1953 if (zeroes) {
1954 peer_req->flags |= EE_ZEROOUT;
81a3537a 1955 return peer_req;
9104d31a
LE
1956 }
1957 if (wsame)
1958 peer_req->flags |= EE_WRITE_SAME;
a73ff323 1959
9104d31a 1960 /* receive payload size bytes into page chain */
b411b363 1961 ds = data_size;
db830c46 1962 page = peer_req->pages;
45bb912b
LE
1963 page_chain_for_each(page) {
1964 unsigned len = min_t(int, ds, PAGE_SIZE);
6b4388ac 1965 data = kmap(page);
69a22773 1966 err = drbd_recv_all_warn(peer_device->connection, data, len);
b30ab791 1967 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
d0180171 1968 drbd_err(device, "Fault injection: Corrupting data on receive\n");
6b4388ac
PR
1969 data[0] = data[0] ^ (unsigned long)-1;
1970 }
b411b363 1971 kunmap(page);
a5c31904 1972 if (err) {
b30ab791 1973 drbd_free_peer_req(device, peer_req);
b411b363
PR
1974 return NULL;
1975 }
a5c31904 1976 ds -= len;
b411b363
PR
1977 }
1978
11f8b2b6 1979 if (digest_size) {
9104d31a 1980 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
11f8b2b6 1981 if (memcmp(dig_in, dig_vv, digest_size)) {
d0180171 1982 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
470be44a 1983 (unsigned long long)sector, data_size);
b30ab791 1984 drbd_free_peer_req(device, peer_req);
b411b363
PR
1985 return NULL;
1986 }
1987 }
11f8b2b6 1988 device->recv_cnt += data_size >> 9;
db830c46 1989 return peer_req;
b411b363
PR
1990}
1991
1992/* drbd_drain_block() just takes a data block
1993 * out of the socket input buffer, and discards it.
1994 */
69a22773 1995static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
b411b363
PR
1996{
1997 struct page *page;
a5c31904 1998 int err = 0;
b411b363
PR
1999 void *data;
2000
c3470cde 2001 if (!data_size)
fc5be839 2002 return 0;
c3470cde 2003
69a22773 2004 page = drbd_alloc_pages(peer_device, 1, 1);
b411b363
PR
2005
2006 data = kmap(page);
2007 while (data_size) {
fc5be839
AG
2008 unsigned int len = min_t(int, data_size, PAGE_SIZE);
2009
69a22773 2010 err = drbd_recv_all_warn(peer_device->connection, data, len);
a5c31904 2011 if (err)
b411b363 2012 break;
a5c31904 2013 data_size -= len;
b411b363
PR
2014 }
2015 kunmap(page);
69a22773 2016 drbd_free_pages(peer_device->device, page, 0);
fc5be839 2017 return err;
b411b363
PR
2018}
2019
69a22773 2020static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
b411b363
PR
2021 sector_t sector, int data_size)
2022{
7988613b
KO
2023 struct bio_vec bvec;
2024 struct bvec_iter iter;
b411b363 2025 struct bio *bio;
11f8b2b6 2026 int digest_size, err, expect;
69a22773
AG
2027 void *dig_in = peer_device->connection->int_dig_in;
2028 void *dig_vv = peer_device->connection->int_dig_vv;
b411b363 2029
11f8b2b6 2030 digest_size = 0;
69a22773 2031 if (peer_device->connection->peer_integrity_tfm) {
3d0e6375 2032 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
11f8b2b6 2033 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
a5c31904
AG
2034 if (err)
2035 return err;
11f8b2b6 2036 data_size -= digest_size;
b411b363
PR
2037 }
2038
b411b363
PR
2039 /* optimistically update recv_cnt. if receiving fails below,
2040 * we disconnect anyways, and counters will be reset. */
69a22773 2041 peer_device->device->recv_cnt += data_size>>9;
b411b363
PR
2042
2043 bio = req->master_bio;
69a22773 2044 D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
b411b363 2045
7988613b
KO
2046 bio_for_each_segment(bvec, bio, iter) {
2047 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
2048 expect = min_t(int, data_size, bvec.bv_len);
69a22773 2049 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
7988613b 2050 kunmap(bvec.bv_page);
a5c31904
AG
2051 if (err)
2052 return err;
2053 data_size -= expect;
b411b363
PR
2054 }
2055
11f8b2b6 2056 if (digest_size) {
69a22773 2057 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
11f8b2b6 2058 if (memcmp(dig_in, dig_vv, digest_size)) {
69a22773 2059 drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
28284cef 2060 return -EINVAL;
b411b363
PR
2061 }
2062 }
2063
69a22773 2064 D_ASSERT(peer_device->device, data_size == 0);
28284cef 2065 return 0;
b411b363
PR
2066}
2067
a990be46 2068/*
668700b4 2069 * e_end_resync_block() is called in ack_sender context via
a990be46
AG
2070 * drbd_finish_peer_reqs().
2071 */
99920dc5 2072static int e_end_resync_block(struct drbd_work *w, int unused)
b411b363 2073{
8050e6d0 2074 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2075 container_of(w, struct drbd_peer_request, w);
2076 struct drbd_peer_device *peer_device = peer_req->peer_device;
2077 struct drbd_device *device = peer_device->device;
db830c46 2078 sector_t sector = peer_req->i.sector;
99920dc5 2079 int err;
b411b363 2080
0b0ba1ef 2081 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
b411b363 2082
db830c46 2083 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b30ab791 2084 drbd_set_in_sync(device, sector, peer_req->i.size);
a8cd15ba 2085 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
b411b363
PR
2086 } else {
2087 /* Record failure to sync */
b30ab791 2088 drbd_rs_failed_io(device, sector, peer_req->i.size);
b411b363 2089
a8cd15ba 2090 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
b411b363 2091 }
b30ab791 2092 dec_unacked(device);
b411b363 2093
99920dc5 2094 return err;
b411b363
PR
2095}
2096
69a22773 2097static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
a0fb3c47 2098 struct packet_info *pi) __releases(local)
b411b363 2099{
69a22773 2100 struct drbd_device *device = peer_device->device;
db830c46 2101 struct drbd_peer_request *peer_req;
b411b363 2102
a0fb3c47 2103 peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
db830c46 2104 if (!peer_req)
45bb912b 2105 goto fail;
b411b363 2106
b30ab791 2107 dec_rs_pending(device);
b411b363 2108
b30ab791 2109 inc_unacked(device);
b411b363
PR
2110 /* corresponding dec_unacked() in e_end_resync_block()
2111 * respective _drbd_clear_done_ee */
2112
a8cd15ba 2113 peer_req->w.cb = e_end_resync_block;
21ae5d7f 2114 peer_req->submit_jif = jiffies;
45bb912b 2115
0500813f 2116 spin_lock_irq(&device->resource->req_lock);
b9ed7080 2117 list_add_tail(&peer_req->w.list, &device->sync_ee);
0500813f 2118 spin_unlock_irq(&device->resource->req_lock);
b411b363 2119
a0fb3c47 2120 atomic_add(pi->size >> 9, &device->rs_sect_ev);
bb3cc85e
MC
2121 if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0,
2122 DRBD_FAULT_RS_WR) == 0)
e1c1b0fc 2123 return 0;
b411b363 2124
10f6d992 2125 /* don't care for the reason here */
d0180171 2126 drbd_err(device, "submit failed, triggering re-connect\n");
0500813f 2127 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2128 list_del(&peer_req->w.list);
0500813f 2129 spin_unlock_irq(&device->resource->req_lock);
22cc37a9 2130
b30ab791 2131 drbd_free_peer_req(device, peer_req);
45bb912b 2132fail:
b30ab791 2133 put_ldev(device);
e1c1b0fc 2134 return -EIO;
b411b363
PR
2135}
2136
668eebc6 2137static struct drbd_request *
b30ab791 2138find_request(struct drbd_device *device, struct rb_root *root, u64 id,
bc9c5c41 2139 sector_t sector, bool missing_ok, const char *func)
51624585 2140{
51624585
AG
2141 struct drbd_request *req;
2142
bc9c5c41
AG
2143 /* Request object according to our peer */
2144 req = (struct drbd_request *)(unsigned long)id;
5e472264 2145 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
668eebc6 2146 return req;
c3afd8f5 2147 if (!missing_ok) {
d0180171 2148 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
c3afd8f5
AG
2149 (unsigned long)id, (unsigned long long)sector);
2150 }
51624585 2151 return NULL;
b411b363
PR
2152}
2153
bde89a9e 2154static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2155{
9f4fe9ad 2156 struct drbd_peer_device *peer_device;
b30ab791 2157 struct drbd_device *device;
b411b363
PR
2158 struct drbd_request *req;
2159 sector_t sector;
82bc0194 2160 int err;
e658983a 2161 struct p_data *p = pi->data;
4a76b161 2162
9f4fe9ad
AG
2163 peer_device = conn_peer_device(connection, pi->vnr);
2164 if (!peer_device)
4a76b161 2165 return -EIO;
9f4fe9ad 2166 device = peer_device->device;
b411b363
PR
2167
2168 sector = be64_to_cpu(p->sector);
2169
0500813f 2170 spin_lock_irq(&device->resource->req_lock);
b30ab791 2171 req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
0500813f 2172 spin_unlock_irq(&device->resource->req_lock);
c3afd8f5 2173 if (unlikely(!req))
82bc0194 2174 return -EIO;
b411b363 2175
24c4830c 2176 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
b411b363
PR
2177 * special casing it there for the various failure cases.
2178 * still no race with drbd_fail_pending_reads */
69a22773 2179 err = recv_dless_read(peer_device, req, sector, pi->size);
82bc0194 2180 if (!err)
8554df1c 2181 req_mod(req, DATA_RECEIVED);
b411b363
PR
2182 /* else: nothing. handled from drbd_disconnect...
2183 * I don't think we may complete this just yet
2184 * in case we are "on-disconnect: freeze" */
2185
82bc0194 2186 return err;
b411b363
PR
2187}
2188
bde89a9e 2189static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2190{
9f4fe9ad 2191 struct drbd_peer_device *peer_device;
b30ab791 2192 struct drbd_device *device;
b411b363 2193 sector_t sector;
82bc0194 2194 int err;
e658983a 2195 struct p_data *p = pi->data;
4a76b161 2196
9f4fe9ad
AG
2197 peer_device = conn_peer_device(connection, pi->vnr);
2198 if (!peer_device)
4a76b161 2199 return -EIO;
9f4fe9ad 2200 device = peer_device->device;
b411b363
PR
2201
2202 sector = be64_to_cpu(p->sector);
0b0ba1ef 2203 D_ASSERT(device, p->block_id == ID_SYNCER);
b411b363 2204
b30ab791 2205 if (get_ldev(device)) {
b411b363
PR
2206 /* data is submitted to disk within recv_resync_read.
2207 * corresponding put_ldev done below on error,
fcefa62e 2208 * or in drbd_peer_request_endio. */
a0fb3c47 2209 err = recv_resync_read(peer_device, sector, pi);
b411b363
PR
2210 } else {
2211 if (__ratelimit(&drbd_ratelimit_state))
d0180171 2212 drbd_err(device, "Can not write resync data to local disk.\n");
b411b363 2213
69a22773 2214 err = drbd_drain_block(peer_device, pi->size);
b411b363 2215
69a22773 2216 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
b411b363
PR
2217 }
2218
b30ab791 2219 atomic_add(pi->size >> 9, &device->rs_sect_in);
778f271d 2220
82bc0194 2221 return err;
b411b363
PR
2222}
2223
b30ab791 2224static void restart_conflicting_writes(struct drbd_device *device,
7be8da07 2225 sector_t sector, int size)
b411b363 2226{
7be8da07
AG
2227 struct drbd_interval *i;
2228 struct drbd_request *req;
2229
b30ab791 2230 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2231 if (!i->local)
2232 continue;
2233 req = container_of(i, struct drbd_request, i);
2234 if (req->rq_state & RQ_LOCAL_PENDING ||
2235 !(req->rq_state & RQ_POSTPONED))
2236 continue;
2312f0b3
LE
2237 /* as it is RQ_POSTPONED, this will cause it to
2238 * be queued on the retry workqueue. */
d4dabbe2 2239 __req_mod(req, CONFLICT_RESOLVED, NULL);
7be8da07
AG
2240 }
2241}
b411b363 2242
a990be46 2243/*
668700b4 2244 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
b411b363 2245 */
99920dc5 2246static int e_end_block(struct drbd_work *w, int cancel)
b411b363 2247{
8050e6d0 2248 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2249 container_of(w, struct drbd_peer_request, w);
2250 struct drbd_peer_device *peer_device = peer_req->peer_device;
2251 struct drbd_device *device = peer_device->device;
db830c46 2252 sector_t sector = peer_req->i.sector;
99920dc5 2253 int err = 0, pcmd;
b411b363 2254
303d1448 2255 if (peer_req->flags & EE_SEND_WRITE_ACK) {
db830c46 2256 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b30ab791
AG
2257 pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2258 device->state.conn <= C_PAUSED_SYNC_T &&
db830c46 2259 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
b411b363 2260 P_RS_WRITE_ACK : P_WRITE_ACK;
a8cd15ba 2261 err = drbd_send_ack(peer_device, pcmd, peer_req);
b411b363 2262 if (pcmd == P_RS_WRITE_ACK)
b30ab791 2263 drbd_set_in_sync(device, sector, peer_req->i.size);
b411b363 2264 } else {
a8cd15ba 2265 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
b411b363
PR
2266 /* we expect it to be marked out of sync anyways...
2267 * maybe assert this? */
2268 }
b30ab791 2269 dec_unacked(device);
b411b363 2270 }
08d0dabf 2271
b411b363
PR
2272 /* we delete from the conflict detection hash _after_ we sent out the
2273 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
302bdeae 2274 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
0500813f 2275 spin_lock_irq(&device->resource->req_lock);
0b0ba1ef 2276 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
b30ab791 2277 drbd_remove_epoch_entry_interval(device, peer_req);
7be8da07 2278 if (peer_req->flags & EE_RESTART_REQUESTS)
b30ab791 2279 restart_conflicting_writes(device, sector, peer_req->i.size);
0500813f 2280 spin_unlock_irq(&device->resource->req_lock);
bb3bfe96 2281 } else
0b0ba1ef 2282 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
b411b363 2283
5dd2ca19 2284 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
b411b363 2285
99920dc5 2286 return err;
b411b363
PR
2287}
2288
a8cd15ba 2289static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
b411b363 2290{
8050e6d0 2291 struct drbd_peer_request *peer_req =
a8cd15ba
AG
2292 container_of(w, struct drbd_peer_request, w);
2293 struct drbd_peer_device *peer_device = peer_req->peer_device;
99920dc5 2294 int err;
b411b363 2295
a8cd15ba
AG
2296 err = drbd_send_ack(peer_device, ack, peer_req);
2297 dec_unacked(peer_device->device);
b411b363 2298
99920dc5 2299 return err;
b411b363
PR
2300}
2301
d4dabbe2 2302static int e_send_superseded(struct drbd_work *w, int unused)
7be8da07 2303{
a8cd15ba 2304 return e_send_ack(w, P_SUPERSEDED);
7be8da07
AG
2305}
2306
99920dc5 2307static int e_send_retry_write(struct drbd_work *w, int unused)
7be8da07 2308{
a8cd15ba
AG
2309 struct drbd_peer_request *peer_req =
2310 container_of(w, struct drbd_peer_request, w);
2311 struct drbd_connection *connection = peer_req->peer_device->connection;
7be8da07 2312
a8cd15ba 2313 return e_send_ack(w, connection->agreed_pro_version >= 100 ?
d4dabbe2 2314 P_RETRY_WRITE : P_SUPERSEDED);
7be8da07 2315}
b411b363 2316
3e394da1
AG
2317static bool seq_greater(u32 a, u32 b)
2318{
2319 /*
2320 * We assume 32-bit wrap-around here.
2321 * For 24-bit wrap-around, we would have to shift:
2322 * a <<= 8; b <<= 8;
2323 */
2324 return (s32)a - (s32)b > 0;
2325}
b411b363 2326
3e394da1
AG
2327static u32 seq_max(u32 a, u32 b)
2328{
2329 return seq_greater(a, b) ? a : b;
b411b363
PR
2330}
2331
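The signed-difference trick is easy to sanity-check in isolation. A small user-space sketch (assumed, not kernel code) that exercises the same idea across a 32-bit wrap:

#include <assert.h>
#include <stdint.h>

/* Same idea as seq_greater() above, written with a well-defined unsigned
 * subtraction followed by a signed reinterpretation. */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	assert(seq_after(5, 3));
	assert(!seq_after(3, 5));
	/* sequence number 2 logically follows 0xfffffffe despite the wrap */
	assert(seq_after(2, 0xfffffffeU));
	assert(!seq_after(0xfffffffeU, 2));
	return 0;
}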
69a22773 2332static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
3e394da1 2333{
69a22773 2334 struct drbd_device *device = peer_device->device;
3c13b680 2335 unsigned int newest_peer_seq;
3e394da1 2336
69a22773 2337 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
b30ab791
AG
2338 spin_lock(&device->peer_seq_lock);
2339 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2340 device->peer_seq = newest_peer_seq;
2341 spin_unlock(&device->peer_seq_lock);
2342 /* wake up only if we actually changed device->peer_seq */
3c13b680 2343 if (peer_seq == newest_peer_seq)
b30ab791 2344 wake_up(&device->seq_wait);
7be8da07 2345 }
b411b363
PR
2346}
2347
d93f6302 2348static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
b6a370ba 2349{
d93f6302
LE
2350 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2351}
b6a370ba 2352
d93f6302 2353/* maybe change sync_ee into interval trees as well? */
b30ab791 2354static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
d93f6302
LE
2355{
2356 struct drbd_peer_request *rs_req;
7e5fec31 2357 bool rv = false;
b6a370ba 2358
0500813f 2359 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2360 list_for_each_entry(rs_req, &device->sync_ee, w.list) {
d93f6302
LE
2361 if (overlaps(peer_req->i.sector, peer_req->i.size,
2362 rs_req->i.sector, rs_req->i.size)) {
7e5fec31 2363 rv = true;
b6a370ba
PR
2364 break;
2365 }
2366 }
0500813f 2367 spin_unlock_irq(&device->resource->req_lock);
b6a370ba
PR
2368
2369 return rv;
2370}
2371
b411b363
PR
2372/* Called from receive_Data.
2373 * Synchronize packets on sock with packets on msock.
2374 *
2375 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2376 * packet traveling on msock, they are still processed in the order they have
2377 * been sent.
2378 *
2379 * Note: we don't care for Ack packets overtaking P_DATA packets.
2380 *
b30ab791 2381 * In case packet_seq is larger than device->peer_seq number, there are
b411b363 2382 * outstanding packets on the msock. We wait for them to arrive.
b30ab791 2383 * In case we are the logically next packet, we update device->peer_seq
b411b363
PR
2384 * ourselves. Correctly handles 32bit wrap around.
2385 *
2386 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
2387 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2388 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 2389 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
2390 *
2391 * returns 0 if we may process the packet,
2392 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
69a22773 2393static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
b411b363 2394{
69a22773 2395 struct drbd_device *device = peer_device->device;
b411b363 2396 DEFINE_WAIT(wait);
b411b363 2397 long timeout;
b874d231 2398 int ret = 0, tp;
7be8da07 2399
69a22773 2400 if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
7be8da07
AG
2401 return 0;
2402
b30ab791 2403 spin_lock(&device->peer_seq_lock);
b411b363 2404 for (;;) {
b30ab791
AG
2405 if (!seq_greater(peer_seq - 1, device->peer_seq)) {
2406 device->peer_seq = seq_max(device->peer_seq, peer_seq);
b411b363 2407 break;
7be8da07 2408 }
b874d231 2409
b411b363
PR
2410 if (signal_pending(current)) {
2411 ret = -ERESTARTSYS;
2412 break;
2413 }
b874d231
PR
2414
2415 rcu_read_lock();
5dd2ca19 2416 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
b874d231
PR
2417 rcu_read_unlock();
2418
2419 if (!tp)
2420 break;
2421
2422 /* Only need to wait if two_primaries is enabled */
b30ab791
AG
2423 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2424 spin_unlock(&device->peer_seq_lock);
44ed167d 2425 rcu_read_lock();
69a22773 2426 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
44ed167d 2427 rcu_read_unlock();
71b1c1eb 2428 timeout = schedule_timeout(timeout);
b30ab791 2429 spin_lock(&device->peer_seq_lock);
7be8da07 2430 if (!timeout) {
b411b363 2431 ret = -ETIMEDOUT;
d0180171 2432 drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
2433 break;
2434 }
2435 }
b30ab791
AG
2436 spin_unlock(&device->peer_seq_lock);
2437 finish_wait(&device->seq_wait, &wait);
b411b363
PR
2438 return ret;
2439}
2440
688593c5
LE
2441/* see also bio_flags_to_wire()
2442 * DRBD_REQ_*, because we need to semantically map the flags to data packet
2443 * flags and back. We may replicate to other kernel versions. */
bb3cc85e 2444static unsigned long wire_flags_to_bio_flags(u32 dpf)
76d2e7ec 2445{
688593c5
LE
2446 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2447 (dpf & DP_FUA ? REQ_FUA : 0) |
28a8f0d3 2448 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
bb3cc85e
MC
2449}
2450
2451static unsigned long wire_flags_to_bio_op(u32 dpf)
2452{
f31e583a 2453 if (dpf & DP_ZEROES)
45c21793 2454 return REQ_OP_WRITE_ZEROES;
f31e583a
LE
2455 if (dpf & DP_DISCARD)
2456 return REQ_OP_DISCARD;
2457 if (dpf & DP_WSAME)
2458 return REQ_OP_WRITE_SAME;
bb3cc85e
MC
2459 else
2460 return REQ_OP_WRITE;
76d2e7ec
PR
2461}
2462
b30ab791 2463static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
7be8da07
AG
2464 unsigned int size)
2465{
2466 struct drbd_interval *i;
2467
2468 repeat:
b30ab791 2469 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2470 struct drbd_request *req;
2471 struct bio_and_error m;
2472
2473 if (!i->local)
2474 continue;
2475 req = container_of(i, struct drbd_request, i);
2476 if (!(req->rq_state & RQ_POSTPONED))
2477 continue;
2478 req->rq_state &= ~RQ_POSTPONED;
2479 __req_mod(req, NEG_ACKED, &m);
0500813f 2480 spin_unlock_irq(&device->resource->req_lock);
7be8da07 2481 if (m.bio)
b30ab791 2482 complete_master_bio(device, &m);
0500813f 2483 spin_lock_irq(&device->resource->req_lock);
7be8da07
AG
2484 goto repeat;
2485 }
2486}
2487
b30ab791 2488static int handle_write_conflicts(struct drbd_device *device,
7be8da07
AG
2489 struct drbd_peer_request *peer_req)
2490{
e33b32de 2491 struct drbd_connection *connection = peer_req->peer_device->connection;
bde89a9e 2492 bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
7be8da07
AG
2493 sector_t sector = peer_req->i.sector;
2494 const unsigned int size = peer_req->i.size;
2495 struct drbd_interval *i;
2496 bool equal;
2497 int err;
2498
2499 /*
2500 * Inserting the peer request into the write_requests tree will prevent
2501 * new conflicting local requests from being added.
2502 */
b30ab791 2503 drbd_insert_interval(&device->write_requests, &peer_req->i);
7be8da07
AG
2504
2505 repeat:
b30ab791 2506 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
7be8da07
AG
2507 if (i == &peer_req->i)
2508 continue;
08d0dabf
LE
2509 if (i->completed)
2510 continue;
7be8da07
AG
2511
2512 if (!i->local) {
2513 /*
2514 * Our peer has sent a conflicting remote request; this
2515 * should not happen in a two-node setup. Wait for the
2516 * earlier peer request to complete.
2517 */
b30ab791 2518 err = drbd_wait_misc(device, i);
7be8da07
AG
2519 if (err)
2520 goto out;
2521 goto repeat;
2522 }
2523
2524 equal = i->sector == sector && i->size == size;
2525 if (resolve_conflicts) {
2526 /*
2527 * If the peer request is fully contained within the
d4dabbe2
LE
2528 * overlapping request, it can be considered overwritten
2529 * and thus superseded; otherwise, it will be retried
2530 * once all overlapping requests have completed.
7be8da07 2531 */
d4dabbe2 2532 bool superseded = i->sector <= sector && i->sector +
7be8da07
AG
2533 (i->size >> 9) >= sector + (size >> 9);
2534
2535 if (!equal)
d0180171 2536 drbd_alert(device, "Concurrent writes detected: "
7be8da07
AG
2537 "local=%llus +%u, remote=%llus +%u, "
2538 "assuming %s came first\n",
2539 (unsigned long long)i->sector, i->size,
2540 (unsigned long long)sector, size,
d4dabbe2 2541 superseded ? "local" : "remote");
7be8da07 2542
a8cd15ba 2543 peer_req->w.cb = superseded ? e_send_superseded :
7be8da07 2544 e_send_retry_write;
a8cd15ba 2545 list_add_tail(&peer_req->w.list, &device->done_ee);
668700b4 2546 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
7be8da07
AG
2547
2548 err = -ENOENT;
2549 goto out;
2550 } else {
2551 struct drbd_request *req =
2552 container_of(i, struct drbd_request, i);
2553
2554 if (!equal)
d0180171 2555 drbd_alert(device, "Concurrent writes detected: "
7be8da07
AG
2556 "local=%llus +%u, remote=%llus +%u\n",
2557 (unsigned long long)i->sector, i->size,
2558 (unsigned long long)sector, size);
2559
2560 if (req->rq_state & RQ_LOCAL_PENDING ||
2561 !(req->rq_state & RQ_POSTPONED)) {
2562 /*
2563 * Wait for the node with the discard flag to
d4dabbe2
LE
2564 * decide if this request has been superseded
2565 * or needs to be retried.
2566 * Requests that have been superseded will
7be8da07
AG
2567 * disappear from the write_requests tree.
2568 *
2569 * In addition, wait for the conflicting
2570 * request to finish locally before submitting
2571 * the conflicting peer request.
2572 */
b30ab791 2573 err = drbd_wait_misc(device, &req->i);
7be8da07 2574 if (err) {
e33b32de 2575 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
b30ab791 2576 fail_postponed_requests(device, sector, size);
7be8da07
AG
2577 goto out;
2578 }
2579 goto repeat;
2580 }
2581 /*
2582 * Remember to restart the conflicting requests after
2583 * the new peer request has completed.
2584 */
2585 peer_req->flags |= EE_RESTART_REQUESTS;
2586 }
2587 }
2588 err = 0;
2589
2590 out:
2591 if (err)
b30ab791 2592 drbd_remove_epoch_entry_interval(device, peer_req);
7be8da07
AG
2593 return err;
2594}
2595
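For clarity, the containment test that decides "superseded" above can be pulled out and checked on its own. A stand-alone sketch with a hypothetical helper (sizes in bytes, sectors of 512 bytes, as in the code above):

#include <assert.h>

typedef unsigned long long sector_t;

/* A peer write is "superseded" when it lies entirely inside an overlapping
 * local request, so acknowledging it as P_SUPERSEDED loses no data. */
static int is_superseded(sector_t local_sector, unsigned int local_size,
			 sector_t peer_sector, unsigned int peer_size)
{
	return local_sector <= peer_sector &&
	       local_sector + (local_size >> 9) >= peer_sector + (peer_size >> 9);
}

int main(void)
{
	/* local write covers sectors 100..131 (16 KiB), peer write 104..111 (4 KiB) */
	assert(is_superseded(100, 16 << 10, 104, 4 << 10));
	/* a peer write sticking out past the local one is not superseded */
	assert(!is_superseded(100, 16 << 10, 120, 8 << 10));
	return 0;
}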
b411b363 2596/* mirrored write */
bde89a9e 2597static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2598{
9f4fe9ad 2599 struct drbd_peer_device *peer_device;
b30ab791 2600 struct drbd_device *device;
21ae5d7f 2601 struct net_conf *nc;
b411b363 2602 sector_t sector;
db830c46 2603 struct drbd_peer_request *peer_req;
e658983a 2604 struct p_data *p = pi->data;
7be8da07 2605 u32 peer_seq = be32_to_cpu(p->seq_num);
bb3cc85e 2606 int op, op_flags;
b411b363 2607 u32 dp_flags;
302bdeae 2608 int err, tp;
b411b363 2609
9f4fe9ad
AG
2610 peer_device = conn_peer_device(connection, pi->vnr);
2611 if (!peer_device)
4a76b161 2612 return -EIO;
9f4fe9ad 2613 device = peer_device->device;
b411b363 2614
b30ab791 2615 if (!get_ldev(device)) {
82bc0194
AG
2616 int err2;
2617
69a22773
AG
2618 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2619 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
bde89a9e 2620 atomic_inc(&connection->current_epoch->epoch_size);
69a22773 2621 err2 = drbd_drain_block(peer_device, pi->size);
82bc0194
AG
2622 if (!err)
2623 err = err2;
2624 return err;
b411b363
PR
2625 }
2626
fcefa62e
AG
2627 /*
2628 * Corresponding put_ldev done either below (on various errors), or in
2629 * drbd_peer_request_endio, if we successfully submit the data at the
2630 * end of this function.
2631 */
b411b363
PR
2632
2633 sector = be64_to_cpu(p->sector);
a0fb3c47 2634 peer_req = read_in_block(peer_device, p->block_id, sector, pi);
db830c46 2635 if (!peer_req) {
b30ab791 2636 put_ldev(device);
82bc0194 2637 return -EIO;
b411b363
PR
2638 }
2639
a8cd15ba 2640 peer_req->w.cb = e_end_block;
21ae5d7f
LE
2641 peer_req->submit_jif = jiffies;
2642 peer_req->flags |= EE_APPLICATION;
b411b363 2643
688593c5 2644 dp_flags = be32_to_cpu(p->dp_flags);
bb3cc85e
MC
2645 op = wire_flags_to_bio_op(dp_flags);
2646 op_flags = wire_flags_to_bio_flags(dp_flags);
a0fb3c47 2647 if (pi->cmd == P_TRIM) {
f31e583a
LE
2648 D_ASSERT(peer_device, peer_req->i.size > 0);
2649 D_ASSERT(peer_device, op == REQ_OP_DISCARD);
2650 D_ASSERT(peer_device, peer_req->pages == NULL);
2651 /* need to play safe: an older DRBD sender
2652 * may mean zero-out while sending P_TRIM. */
2653 if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
2654 peer_req->flags |= EE_ZEROOUT;
2655 } else if (pi->cmd == P_ZEROES) {
a0fb3c47 2656 D_ASSERT(peer_device, peer_req->i.size > 0);
45c21793 2657 D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
a0fb3c47 2658 D_ASSERT(peer_device, peer_req->pages == NULL);
f31e583a
LE
2659 /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
2660 if (dp_flags & DP_DISCARD)
2661 peer_req->flags |= EE_TRIM;
a0fb3c47 2662 } else if (peer_req->pages == NULL) {
0b0ba1ef
AG
2663 D_ASSERT(device, peer_req->i.size == 0);
2664 D_ASSERT(device, dp_flags & DP_FLUSH);
a73ff323 2665 }
688593c5
LE
2666
2667 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2668 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2669
bde89a9e
AG
2670 spin_lock(&connection->epoch_lock);
2671 peer_req->epoch = connection->current_epoch;
db830c46
AG
2672 atomic_inc(&peer_req->epoch->epoch_size);
2673 atomic_inc(&peer_req->epoch->active);
bde89a9e 2674 spin_unlock(&connection->epoch_lock);
b411b363 2675
302bdeae 2676 rcu_read_lock();
21ae5d7f
LE
2677 nc = rcu_dereference(peer_device->connection->net_conf);
2678 tp = nc->two_primaries;
2679 if (peer_device->connection->agreed_pro_version < 100) {
2680 switch (nc->wire_protocol) {
2681 case DRBD_PROT_C:
2682 dp_flags |= DP_SEND_WRITE_ACK;
2683 break;
2684 case DRBD_PROT_B:
2685 dp_flags |= DP_SEND_RECEIVE_ACK;
2686 break;
2687 }
2688 }
302bdeae 2689 rcu_read_unlock();
21ae5d7f
LE
2690
2691 if (dp_flags & DP_SEND_WRITE_ACK) {
2692 peer_req->flags |= EE_SEND_WRITE_ACK;
2693 inc_unacked(device);
2694 /* corresponding dec_unacked() in e_end_block()
2695 * respective _drbd_clear_done_ee */
2696 }
2697
2698 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2699 /* I really don't like it that the receiver thread
2700 * sends on the msock, but anyways */
5dd2ca19 2701 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
21ae5d7f
LE
2702 }
2703
302bdeae 2704 if (tp) {
21ae5d7f
LE
2705 /* two primaries implies protocol C */
2706 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
302bdeae 2707 peer_req->flags |= EE_IN_INTERVAL_TREE;
69a22773 2708 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
7be8da07 2709 if (err)
b411b363 2710 goto out_interrupted;
0500813f 2711 spin_lock_irq(&device->resource->req_lock);
b30ab791 2712 err = handle_write_conflicts(device, peer_req);
7be8da07 2713 if (err) {
0500813f 2714 spin_unlock_irq(&device->resource->req_lock);
7be8da07 2715 if (err == -ENOENT) {
b30ab791 2716 put_ldev(device);
82bc0194 2717 return 0;
b411b363 2718 }
7be8da07 2719 goto out_interrupted;
b411b363 2720 }
b874d231 2721 } else {
69a22773 2722 update_peer_seq(peer_device, peer_seq);
0500813f 2723 spin_lock_irq(&device->resource->req_lock);
b874d231 2724 }
9104d31a
LE
 2725 /* TRIM, WRITE_SAME and ZEROOUT requests are processed synchronously:
 2726 * drbd_submit_peer_request() waits for all pending requests, i.e. for
 2727 * active_ee to become empty, so better not add ourselves to active_ee here. */
f31e583a 2729 if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0)
b9ed7080 2730 list_add_tail(&peer_req->w.list, &device->active_ee);
0500813f 2731 spin_unlock_irq(&device->resource->req_lock);
b411b363 2732
b30ab791
AG
2733 if (device->state.conn == C_SYNC_TARGET)
2734 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
b411b363 2735
b30ab791 2736 if (device->state.pdsk < D_INCONSISTENT) {
b411b363 2737 /* In case we have the only disk of the cluster, */
b30ab791 2738 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
db830c46 2739 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
4dd726f0 2740 drbd_al_begin_io(device, &peer_req->i);
21ae5d7f 2741 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
b411b363
PR
2742 }
2743
bb3cc85e
MC
2744 err = drbd_submit_peer_request(device, peer_req, op, op_flags,
2745 DRBD_FAULT_DT_WR);
82bc0194
AG
2746 if (!err)
2747 return 0;
b411b363 2748
10f6d992 2749 /* don't care for the reason here */
d0180171 2750 drbd_err(device, "submit failed, triggering re-connect\n");
0500813f 2751 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 2752 list_del(&peer_req->w.list);
b30ab791 2753 drbd_remove_epoch_entry_interval(device, peer_req);
0500813f 2754 spin_unlock_irq(&device->resource->req_lock);
21ae5d7f
LE
2755 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2756 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
b30ab791 2757 drbd_al_complete_io(device, &peer_req->i);
21ae5d7f 2758 }
22cc37a9 2759
b411b363 2760out_interrupted:
7e5fec31 2761 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
b30ab791
AG
2762 put_ldev(device);
2763 drbd_free_peer_req(device, peer_req);
82bc0194 2764 return err;
b411b363
PR
2765}
2766
0f0601f4
LE
2767/* We may throttle resync, if the lower device seems to be busy,
2768 * and current sync rate is above c_min_rate.
2769 *
2770 * To decide whether or not the lower device is busy, we use a scheme similar
 2771 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
 2772 * (more than 64 sectors of) activity that we cannot account for with our own
 2773 * resync activity, the device obviously is "busy".
2774 *
2775 * The current sync rate used here uses only the most recent two step marks,
2776 * to have a short time average so we can react faster.
2777 */
ad3fee79
LE
2778bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2779 bool throttle_if_app_is_waiting)
0f0601f4 2780{
e3555d85 2781 struct lc_element *tmp;
ad3fee79 2782 bool throttle = drbd_rs_c_min_rate_throttle(device);
daeda1cc 2783
ad3fee79
LE
2784 if (!throttle || throttle_if_app_is_waiting)
2785 return throttle;
0f0601f4 2786
b30ab791
AG
2787 spin_lock_irq(&device->al_lock);
2788 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
e3555d85
PR
2789 if (tmp) {
2790 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
e8299874
LE
2791 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2792 throttle = false;
ad3fee79
LE
2793 /* Do not slow down if app IO is already waiting for this extent,
2794 * and our progress is necessary for application IO to complete. */
e3555d85 2795 }
b30ab791 2796 spin_unlock_irq(&device->al_lock);
e3555d85 2797
e8299874
LE
2798 return throttle;
2799}
2800
2801bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2802{
2803 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2804 unsigned long db, dt, dbdt;
2805 unsigned int c_min_rate;
2806 int curr_events;
2807
2808 rcu_read_lock();
2809 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2810 rcu_read_unlock();
2811
2812 /* feature disabled? */
2813 if (c_min_rate == 0)
2814 return false;
2815
59767fbd 2816 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
b30ab791 2817 atomic_read(&device->rs_sect_ev);
ad3fee79
LE
2818
2819 if (atomic_read(&device->ap_actlog_cnt)
ff8bd88b 2820 || curr_events - device->rs_last_events > 64) {
0f0601f4
LE
2821 unsigned long rs_left;
2822 int i;
2823
b30ab791 2824 device->rs_last_events = curr_events;
0f0601f4
LE
2825
2826 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2827 * approx. */
b30ab791 2828 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2649f080 2829
b30ab791
AG
2830 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2831 rs_left = device->ov_left;
2649f080 2832 else
b30ab791 2833 rs_left = drbd_bm_total_weight(device) - device->rs_failed;
0f0601f4 2834
b30ab791 2835 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
0f0601f4
LE
2836 if (!dt)
2837 dt++;
b30ab791 2838 db = device->rs_mark_left[i] - rs_left;
0f0601f4
LE
2839 dbdt = Bit2KB(db/dt);
2840
daeda1cc 2841 if (dbdt > c_min_rate)
e8299874 2842 return true;
0f0601f4 2843 }
e8299874 2844 return false;
0f0601f4
LE
2845}
2846
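The rate check itself reduces to simple arithmetic. The sketch below is a hypothetical user-space helper (not drbd code); the factor of 4 assumes the usual DRBD bitmap granularity of 4 KiB per bit, which is what Bit2KB() expresses above.

#include <stdbool.h>
#include <stdio.h>

static bool above_min_rate(unsigned long bits_resynced, unsigned long seconds,
			   unsigned int c_min_rate_kb)
{
	unsigned long dbdt;

	if (!seconds)
		seconds = 1;			/* avoid div-by-zero, like "dt++" above */
	dbdt = (bits_resynced / seconds) * 4;	/* assume one bitmap bit == 4 KiB */
	return dbdt > c_min_rate_kb;
}

int main(void)
{
	/* 100000 bits (~390 MiB) resynced in 3 s is far above a 250 KiB/s floor,
	 * so this resync would be a candidate for throttling */
	printf("%d\n", above_min_rate(100000, 3, 250));
	return 0;
}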
bde89a9e 2847static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
b411b363 2848{
9f4fe9ad 2849 struct drbd_peer_device *peer_device;
b30ab791 2850 struct drbd_device *device;
b411b363 2851 sector_t sector;
4a76b161 2852 sector_t capacity;
db830c46 2853 struct drbd_peer_request *peer_req;
b411b363 2854 struct digest_info *di = NULL;
b18b37be 2855 int size, verb;
b411b363 2856 unsigned int fault_type;
e658983a 2857 struct p_block_req *p = pi->data;
4a76b161 2858
9f4fe9ad
AG
2859 peer_device = conn_peer_device(connection, pi->vnr);
2860 if (!peer_device)
4a76b161 2861 return -EIO;
9f4fe9ad 2862 device = peer_device->device;
b30ab791 2863 capacity = drbd_get_capacity(device->this_bdev);
b411b363
PR
2864
2865 sector = be64_to_cpu(p->sector);
2866 size = be32_to_cpu(p->blksize);
2867
c670a398 2868 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
d0180171 2869 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
b411b363 2870 (unsigned long long)sector, size);
82bc0194 2871 return -EINVAL;
b411b363
PR
2872 }
2873 if (sector + (size>>9) > capacity) {
d0180171 2874 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
b411b363 2875 (unsigned long long)sector, size);
82bc0194 2876 return -EINVAL;
b411b363
PR
2877 }
2878
b30ab791 2879 if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
b18b37be 2880 verb = 1;
e2857216 2881 switch (pi->cmd) {
b18b37be 2882 case P_DATA_REQUEST:
69a22773 2883 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
b18b37be 2884 break;
700ca8c0 2885 case P_RS_THIN_REQ:
b18b37be
PR
2886 case P_RS_DATA_REQUEST:
2887 case P_CSUM_RS_REQUEST:
2888 case P_OV_REQUEST:
69a22773 2889 drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
b18b37be
PR
2890 break;
2891 case P_OV_REPLY:
2892 verb = 0;
b30ab791 2893 dec_rs_pending(device);
69a22773 2894 drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
b18b37be
PR
2895 break;
2896 default:
49ba9b1b 2897 BUG();
b18b37be
PR
2898 }
2899 if (verb && __ratelimit(&drbd_ratelimit_state))
d0180171 2900 drbd_err(device, "Can not satisfy peer's read request, "
b411b363 2901 "no local data.\n");
b18b37be 2902
a821cc4a 2903 /* drain the payload, if any */
69a22773 2904 return drbd_drain_block(peer_device, pi->size);
b411b363
PR
2905 }
2906
2907 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2908 * "criss-cross" setup, that might cause write-out on some other DRBD,
2909 * which in turn might block on the other node at this very place. */
a0fb3c47 2910 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
9104d31a 2911 size, GFP_NOIO);
db830c46 2912 if (!peer_req) {
b30ab791 2913 put_ldev(device);
82bc0194 2914 return -ENOMEM;
b411b363
PR
2915 }
2916
e2857216 2917 switch (pi->cmd) {
b411b363 2918 case P_DATA_REQUEST:
a8cd15ba 2919 peer_req->w.cb = w_e_end_data_req;
b411b363 2920 fault_type = DRBD_FAULT_DT_RD;
80a40e43 2921 /* application IO, don't drbd_rs_begin_io */
21ae5d7f 2922 peer_req->flags |= EE_APPLICATION;
80a40e43
LE
2923 goto submit;
2924
700ca8c0
PR
2925 case P_RS_THIN_REQ:
2926 /* If at some point in the future we have a smart way to
2927 find out if this data block is completely deallocated,
2928 then we would do something smarter here than reading
2929 the block... */
2930 peer_req->flags |= EE_RS_THIN_REQ;
d769a992 2931 /* fall through */
b411b363 2932 case P_RS_DATA_REQUEST:
a8cd15ba 2933 peer_req->w.cb = w_e_end_rsdata_req;
b411b363 2934 fault_type = DRBD_FAULT_RS_RD;
5f9915bb 2935 /* used in the sector offset progress display */
b30ab791 2936 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2937 break;
2938
2939 case P_OV_REPLY:
2940 case P_CSUM_RS_REQUEST:
2941 fault_type = DRBD_FAULT_RS_RD;
e2857216 2942 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2943 if (!di)
2944 goto out_free_e;
2945
e2857216 2946 di->digest_size = pi->size;
b411b363
PR
2947 di->digest = (((char *)di)+sizeof(struct digest_info));
2948
db830c46
AG
2949 peer_req->digest = di;
2950 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2951
9f4fe9ad 2952 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
b411b363
PR
2953 goto out_free_e;
2954
e2857216 2955 if (pi->cmd == P_CSUM_RS_REQUEST) {
9f4fe9ad 2956 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
a8cd15ba 2957 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb 2958 /* used in the sector offset progress display */
b30ab791 2959 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
aaaba345
LE
2960 /* remember to report stats in drbd_resync_finished */
2961 device->use_csums = true;
e2857216 2962 } else if (pi->cmd == P_OV_REPLY) {
2649f080 2963 /* track progress, we may need to throttle */
b30ab791 2964 atomic_add(size >> 9, &device->rs_sect_in);
a8cd15ba 2965 peer_req->w.cb = w_e_end_ov_reply;
b30ab791 2966 dec_rs_pending(device);
0f0601f4
LE
2967 /* drbd_rs_begin_io done when we sent this request,
2968 * but accounting still needs to be done. */
2969 goto submit_for_resync;
b411b363
PR
2970 }
2971 break;
2972
2973 case P_OV_REQUEST:
b30ab791 2974 if (device->ov_start_sector == ~(sector_t)0 &&
9f4fe9ad 2975 peer_device->connection->agreed_pro_version >= 90) {
de228bba
LE
2976 unsigned long now = jiffies;
2977 int i;
b30ab791
AG
2978 device->ov_start_sector = sector;
2979 device->ov_position = sector;
2980 device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2981 device->rs_total = device->ov_left;
de228bba 2982 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
b30ab791
AG
2983 device->rs_mark_left[i] = device->ov_left;
2984 device->rs_mark_time[i] = now;
de228bba 2985 }
d0180171 2986 drbd_info(device, "Online Verify start sector: %llu\n",
b411b363
PR
2987 (unsigned long long)sector);
2988 }
a8cd15ba 2989 peer_req->w.cb = w_e_end_ov_req;
b411b363 2990 fault_type = DRBD_FAULT_RS_RD;
b411b363
PR
2991 break;
2992
b411b363 2993 default:
49ba9b1b 2994 BUG();
b411b363
PR
2995 }
2996
0f0601f4
LE
2997 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2998 * wrt the receiver, but it is not as straightforward as it may seem.
2999 * Various places in the resync start and stop logic assume resync
3000 * requests are processed in order, requeuing this on the worker thread
3001 * introduces a bunch of new code for synchronization between threads.
3002 *
3003 * Unlimited throttling before drbd_rs_begin_io may stall the resync
3004 * "forever", throttling after drbd_rs_begin_io will lock that extent
3005 * for application writes for the same time. For now, just throttle
3006 * here, where the rest of the code expects the receiver to sleep for
3007 * a while, anyways.
3008 */
3009
3010 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
3011 * this defers syncer requests for some time, before letting at least
 3012 * one request through. The resync controller on the receiving side
3013 * will adapt to the incoming rate accordingly.
3014 *
3015 * We cannot throttle here if remote is Primary/SyncTarget:
3016 * we would also throttle its application reads.
3017 * In that case, throttling is done on the SyncTarget only.
3018 */
c5a2c150
LE
3019
3020 /* Even though this may be a resync request, we do add to "read_ee";
3021 * "sync_ee" is only used for resync WRITEs.
3022 * Add to list early, so debugfs can find this request
3023 * even if we have to sleep below. */
3024 spin_lock_irq(&device->resource->req_lock);
3025 list_add_tail(&peer_req->w.list, &device->read_ee);
3026 spin_unlock_irq(&device->resource->req_lock);
3027
944410e9 3028 update_receiver_timing_details(connection, drbd_rs_should_slow_down);
ad3fee79
LE
3029 if (device->state.peer != R_PRIMARY
3030 && drbd_rs_should_slow_down(device, sector, false))
e3555d85 3031 schedule_timeout_uninterruptible(HZ/10);
944410e9 3032 update_receiver_timing_details(connection, drbd_rs_begin_io);
b30ab791 3033 if (drbd_rs_begin_io(device, sector))
80a40e43 3034 goto out_free_e;
b411b363 3035
0f0601f4 3036submit_for_resync:
b30ab791 3037 atomic_add(size >> 9, &device->rs_sect_ev);
0f0601f4 3038
80a40e43 3039submit:
944410e9 3040 update_receiver_timing_details(connection, drbd_submit_peer_request);
b30ab791 3041 inc_unacked(device);
bb3cc85e
MC
3042 if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
3043 fault_type) == 0)
82bc0194 3044 return 0;
b411b363 3045
10f6d992 3046 /* don't care for the reason here */
d0180171 3047 drbd_err(device, "submit failed, triggering re-connect\n");
c5a2c150
LE
3048
3049out_free_e:
0500813f 3050 spin_lock_irq(&device->resource->req_lock);
a8cd15ba 3051 list_del(&peer_req->w.list);
0500813f 3052 spin_unlock_irq(&device->resource->req_lock);
22cc37a9
LE
3053 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
3054
b30ab791
AG
3055 put_ldev(device);
3056 drbd_free_peer_req(device, peer_req);
82bc0194 3057 return -EIO;
b411b363
PR
3058}
3059
69a22773
AG
3060/**
3061 * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries
3062 */
3063static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3064{
69a22773 3065 struct drbd_device *device = peer_device->device;
3066 int self, peer, rv = -100;
3067 unsigned long ch_self, ch_peer;
44ed167d 3068 enum drbd_after_sb_p after_sb_0p;
b411b363 3069
3070 self = device->ldev->md.uuid[UI_BITMAP] & 1;
3071 peer = device->p_uuid[UI_BITMAP] & 1;
b411b363 3072
3073 ch_peer = device->p_uuid[UI_SIZE];
3074 ch_self = device->comm_bm_set;
b411b363 3075
44ed167d 3076 rcu_read_lock();
69a22773 3077 after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
3078 rcu_read_unlock();
3079 switch (after_sb_0p) {
3080 case ASB_CONSENSUS:
3081 case ASB_DISCARD_SECONDARY:
3082 case ASB_CALL_HELPER:
44ed167d 3083 case ASB_VIOLENTLY:
d0180171 3084 drbd_err(device, "Configuration error.\n");
3085 break;
3086 case ASB_DISCONNECT:
3087 break;
3088 case ASB_DISCARD_YOUNGER_PRI:
3089 if (self == 0 && peer == 1) {
3090 rv = -1;
3091 break;
3092 }
3093 if (self == 1 && peer == 0) {
3094 rv = 1;
3095 break;
3096 }
3097 /* Else fall through to one of the other strategies... */
3098 case ASB_DISCARD_OLDER_PRI:
3099 if (self == 0 && peer == 1) {
3100 rv = 1;
3101 break;
3102 }
3103 if (self == 1 && peer == 0) {
3104 rv = -1;
3105 break;
3106 }
3107 /* Else fall through to one of the other strategies... */
d0180171 3108 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
b411b363 3109 "Using discard-least-changes instead\n");
d769a992 3110 /* fall through */
3111 case ASB_DISCARD_ZERO_CHG:
3112 if (ch_peer == 0 && ch_self == 0) {
69a22773 3113 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3114 ? -1 : 1;
3115 break;
3116 } else {
3117 if (ch_peer == 0) { rv = 1; break; }
3118 if (ch_self == 0) { rv = -1; break; }
3119 }
44ed167d 3120 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363 3121 break;
d769a992 3122 /* else: fall through */
3123 case ASB_DISCARD_LEAST_CHG:
3124 if (ch_self < ch_peer)
3125 rv = -1;
3126 else if (ch_self > ch_peer)
3127 rv = 1;
3128 else /* ( ch_self == ch_peer ) */
3129 /* Well, then use something else. */
69a22773 3130 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3131 ? -1 : 1;
3132 break;
3133 case ASB_DISCARD_LOCAL:
3134 rv = -1;
3135 break;
3136 case ASB_DISCARD_REMOTE:
3137 rv = 1;
3138 }
3139
3140 return rv;
3141}
3142
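To make the return convention concrete (an illustrative reading, based on how drbd_sync_handshake() interprets the result further below): with after-sb-0pri set to discard-younger-primary, self == 0 and peer == 1 yields rv = -1, i.e. the local node discards its changes and becomes the sync target, while the mirrored combination yields rv = 1 and the peer's data is discarded instead; rv = -100 means the policy could not decide and the connection will be dropped.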
3143/**
3144 * drbd_asb_recover_1p - Recover after split-brain with one remaining primary
3145 */
3146static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3147{
69a22773 3148 struct drbd_device *device = peer_device->device;
6184ea21 3149 int hg, rv = -100;
44ed167d 3150 enum drbd_after_sb_p after_sb_1p;
b411b363 3151
44ed167d 3152 rcu_read_lock();
69a22773 3153 after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
44ed167d
PR
3154 rcu_read_unlock();
3155 switch (after_sb_1p) {
b411b363
PR
3156 case ASB_DISCARD_YOUNGER_PRI:
3157 case ASB_DISCARD_OLDER_PRI:
3158 case ASB_DISCARD_LEAST_CHG:
3159 case ASB_DISCARD_LOCAL:
3160 case ASB_DISCARD_REMOTE:
44ed167d 3161 case ASB_DISCARD_ZERO_CHG:
d0180171 3162 drbd_err(device, "Configuration error.\n");
b411b363
PR
3163 break;
3164 case ASB_DISCONNECT:
3165 break;
3166 case ASB_CONSENSUS:
69a22773 3167 hg = drbd_asb_recover_0p(peer_device);
b30ab791 3168 if (hg == -1 && device->state.role == R_SECONDARY)
b411b363 3169 rv = hg;
b30ab791 3170 if (hg == 1 && device->state.role == R_PRIMARY)
b411b363
PR
3171 rv = hg;
3172 break;
3173 case ASB_VIOLENTLY:
69a22773 3174 rv = drbd_asb_recover_0p(peer_device);
b411b363
PR
3175 break;
3176 case ASB_DISCARD_SECONDARY:
b30ab791 3177 return device->state.role == R_PRIMARY ? 1 : -1;
b411b363 3178 case ASB_CALL_HELPER:
69a22773 3179 hg = drbd_asb_recover_0p(peer_device);
b30ab791 3180 if (hg == -1 && device->state.role == R_PRIMARY) {
3181 enum drbd_state_rv rv2;
3182
3183 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3184 * we might be here in C_WF_REPORT_PARAMS which is transient.
3185 * we do not need to wait for the after state change work either. */
b30ab791 3186 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
bb437946 3187 if (rv2 != SS_SUCCESS) {
b30ab791 3188 drbd_khelper(device, "pri-lost-after-sb");
b411b363 3189 } else {
d0180171 3190 drbd_warn(device, "Successfully gave up primary role.\n");
3191 rv = hg;
3192 }
3193 } else
3194 rv = hg;
3195 }
3196
3197 return rv;
3198}
3199
69a22773
AG
3200/**
3201 * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries
3202 */
3203static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
b411b363 3204{
69a22773 3205 struct drbd_device *device = peer_device->device;
6184ea21 3206 int hg, rv = -100;
44ed167d 3207 enum drbd_after_sb_p after_sb_2p;
b411b363 3208
44ed167d 3209 rcu_read_lock();
69a22773 3210 after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
44ed167d
PR
3211 rcu_read_unlock();
3212 switch (after_sb_2p) {
b411b363
PR
3213 case ASB_DISCARD_YOUNGER_PRI:
3214 case ASB_DISCARD_OLDER_PRI:
3215 case ASB_DISCARD_LEAST_CHG:
3216 case ASB_DISCARD_LOCAL:
3217 case ASB_DISCARD_REMOTE:
3218 case ASB_CONSENSUS:
3219 case ASB_DISCARD_SECONDARY:
44ed167d 3220 case ASB_DISCARD_ZERO_CHG:
d0180171 3221 drbd_err(device, "Configuration error.\n");
b411b363
PR
3222 break;
3223 case ASB_VIOLENTLY:
69a22773 3224 rv = drbd_asb_recover_0p(peer_device);
b411b363
PR
3225 break;
3226 case ASB_DISCONNECT:
3227 break;
3228 case ASB_CALL_HELPER:
69a22773 3229 hg = drbd_asb_recover_0p(peer_device);
b411b363 3230 if (hg == -1) {
3231 enum drbd_state_rv rv2;
3232
3233 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3234 * we might be here in C_WF_REPORT_PARAMS which is transient.
3235 * we do not need to wait for the after state change work either. */
b30ab791 3236 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
bb437946 3237 if (rv2 != SS_SUCCESS) {
b30ab791 3238 drbd_khelper(device, "pri-lost-after-sb");
b411b363 3239 } else {
d0180171 3240 drbd_warn(device, "Successfully gave up primary role.\n");
3241 rv = hg;
3242 }
3243 } else
3244 rv = hg;
3245 }
3246
3247 return rv;
3248}
3249
b30ab791 3250static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
3251 u64 bits, u64 flags)
3252{
3253 if (!uuid) {
d0180171 3254 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
3255 return;
3256 }
d0180171 3257 drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
3258 text,
3259 (unsigned long long)uuid[UI_CURRENT],
3260 (unsigned long long)uuid[UI_BITMAP],
3261 (unsigned long long)uuid[UI_HISTORY_START],
3262 (unsigned long long)uuid[UI_HISTORY_END],
3263 (unsigned long long)bits,
3264 (unsigned long long)flags);
3265}
3266
3267/*
3268 100 after split brain try auto recover
3269 2 C_SYNC_SOURCE set BitMap
3270 1 C_SYNC_SOURCE use BitMap
3271 0 no Sync
3272 -1 C_SYNC_TARGET use BitMap
3273 -2 C_SYNC_TARGET set BitMap
3274 -100 after split brain, disconnect
3275-1000 unrelated data
3276-1091 requires proto 91
3277-1096 requires proto 96
b411b363 3278 */
3279
3280static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
b411b363 3281{
44a4d551
LE
3282 struct drbd_peer_device *const peer_device = first_peer_device(device);
3283 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
b411b363
PR
3284 u64 self, peer;
3285 int i, j;
3286
b30ab791
AG
3287 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
3288 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
3289
3290 *rule_nr = 10;
3291 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
3292 return 0;
3293
3294 *rule_nr = 20;
3295 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
3296 peer != UUID_JUST_CREATED)
3297 return -2;
3298
3299 *rule_nr = 30;
3300 if (self != UUID_JUST_CREATED &&
3301 (peer == UUID_JUST_CREATED || peer == (u64)0))
3302 return 2;
3303
3304 if (self == peer) {
3305 int rct, dc; /* roles at crash time */
3306
b30ab791 3307 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
b411b363 3308
44a4d551 3309 if (connection->agreed_pro_version < 91)
4a23f264 3310 return -1091;
b411b363 3311
b30ab791
AG
3312 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
3313 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
d0180171 3314 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
b30ab791
AG
3315 drbd_uuid_move_history(device);
3316 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3317 device->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 3318
b30ab791
AG
3319 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3320 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
3321 *rule_nr = 34;
3322 } else {
d0180171 3323 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
b411b363
PR
3324 *rule_nr = 36;
3325 }
3326
3327 return 1;
3328 }
3329
b30ab791 3330 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
b411b363 3331
44a4d551 3332 if (connection->agreed_pro_version < 91)
4a23f264 3333 return -1091;
b411b363 3334
b30ab791
AG
3335 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
3336 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
d0180171 3337 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
b411b363 3338
b30ab791
AG
3339 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
3340 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
3341 device->p_uuid[UI_BITMAP] = 0UL;
b411b363 3342
b30ab791 3343 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363
PR
3344 *rule_nr = 35;
3345 } else {
d0180171 3346 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
b411b363
PR
3347 *rule_nr = 37;
3348 }
3349
3350 return -1;
3351 }
3352
3353 /* Common power [off|failure] */
b30ab791
AG
3354 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
3355 (device->p_uuid[UI_FLAGS] & 2);
b411b363
PR
3356 /* lowest bit is set when we were primary,
3357 * next bit (weight 2) is set when peer was primary */
3358 *rule_nr = 40;
3359
3360 /* Neither has the "crashed primary" flag set,
 3361	 * only a replication link hiccup. */
3362 if (rct == 0)
3363 return 0;
3364
3365 /* Current UUID equal and no bitmap uuid; does not necessarily
3366 * mean this was a "simultaneous hard crash", maybe IO was
3367 * frozen, so no UUID-bump happened.
3368 * This is a protocol change, overload DRBD_FF_WSAME as flag
3369 * for "new-enough" peer DRBD version. */
3370 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
3371 *rule_nr = 41;
3372 if (!(connection->agreed_features & DRBD_FF_WSAME)) {
3373 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
3374 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
3375 }
3376 if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
3377 /* At least one has the "crashed primary" bit set,
3378 * both are primary now, but neither has rotated its UUIDs?
3379 * "Can not happen." */
3380 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
3381 return -100;
3382 }
3383 if (device->state.role == R_PRIMARY)
3384 return 1;
3385 return -1;
3386 }
3387
3388 /* Both are secondary.
3389 * Really looks like recovery from simultaneous hard crash.
3390 * Check which had been primary before, and arbitrate. */
b411b363 3391 switch (rct) {
f2d3d75b 3392 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
b411b363
PR
3393 case 1: /* self_pri && !peer_pri */ return 1;
3394 case 2: /* !self_pri && peer_pri */ return -1;
3395 case 3: /* self_pri && peer_pri */
44a4d551 3396 dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
b411b363
PR
3397 return dc ? -1 : 1;
3398 }
3399 }
3400
3401 *rule_nr = 50;
b30ab791 3402 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3403 if (self == peer)
3404 return -1;
3405
3406 *rule_nr = 51;
b30ab791 3407 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3408 if (self == peer) {
44a4d551 3409 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3410 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
3411 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
3412 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
 3413			/* The last P_SYNC_UUID did not get through. Undo the modifications
 3414			   made to the peer's UUIDs at the last start of a resync as sync source. */
3415
44a4d551 3416 if (connection->agreed_pro_version < 91)
4a23f264 3417 return -1091;
b411b363 3418
b30ab791
AG
3419 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
3420 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
4a23f264 3421
d0180171 3422 drbd_info(device, "Lost last syncUUID packet, corrected:\n");
b30ab791 3423 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
4a23f264 3424
b411b363
PR
3425 return -1;
3426 }
3427 }
3428
3429 *rule_nr = 60;
b30ab791 3430 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
b411b363 3431 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3432 peer = device->p_uuid[i] & ~((u64)1);
b411b363
PR
3433 if (self == peer)
3434 return -2;
3435 }
3436
3437 *rule_nr = 70;
b30ab791
AG
3438 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3439 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
3440 if (self == peer)
3441 return 1;
3442
3443 *rule_nr = 71;
b30ab791 3444 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3445 if (self == peer) {
44a4d551 3446 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3447 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
3448 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
3449 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
 3450			/* The last P_SYNC_UUID did not get through. Undo the modifications
 3451			   made to our own UUIDs at the last start of a resync as sync source. */
3452
44a4d551 3453 if (connection->agreed_pro_version < 91)
4a23f264 3454 return -1091;
b411b363 3455
b30ab791
AG
3456 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
3457 __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
b411b363 3458
d0180171 3459 drbd_info(device, "Last syncUUID did not get through, corrected:\n");
b30ab791
AG
3460 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3461 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
3462
3463 return 1;
3464 }
3465 }
3466
3467
3468 *rule_nr = 80;
b30ab791 3469 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363 3470 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3471 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363
PR
3472 if (self == peer)
3473 return 2;
3474 }
3475
3476 *rule_nr = 90;
b30ab791
AG
3477 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3478 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3479 if (self == peer && self != ((u64)0))
3480 return 100;
3481
3482 *rule_nr = 100;
3483 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3484 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363 3485 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
b30ab791 3486 peer = device->p_uuid[j] & ~((u64)1);
b411b363
PR
3487 if (self == peer)
3488 return -100;
3489 }
3490 }
3491
3492 return -1000;
3493}
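For quick reference against the legend above, here is a minimal user-space sketch (not part of the driver; the helper name and the printf-based reporting are illustrative assumptions) of how a caller can decode drbd_uuid_compare()'s special negative return values; drbd_sync_handshake() below performs the same decoding in kernel context.

#include <stdio.h>

/* Illustrative only: decode the special negative results of
 * drbd_uuid_compare() as documented in the legend above. */
static void explain_uuid_compare_result(int hg)
{
	if (hg == -1000) {
		printf("unrelated data\n");
	} else if (hg < -0x10000) {
		int v = -hg;	/* encoded as 0x10000 | protocol | (feature flags << 8) */
		printf("peer must support protocol %d and feature flags 0x%x\n",
		       v & 0xff, (v >> 8) & 0xff);
	} else if (hg < -1000) {
		printf("peer must support at least protocol %d\n", -hg - 1000); /* e.g. -1091 -> 91 */
	} else if (hg == 100 || hg == -100) {
		printf("split brain: %s\n", hg == 100 ? "try auto recover" : "disconnect");
	} else {
		/* -2/-1: become SyncTarget, 1/2: become SyncSource, 0: no sync;
		 * absolute value 2 means a full sync (set bitmap). */
		printf("resync decision: %d\n", hg);
	}
}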
3494
3495/* drbd_sync_handshake() returns the new conn state on success, or
3496 CONN_MASK (-1) on failure.
3497 */
69a22773
AG
3498static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3499 enum drbd_role peer_role,
b411b363
PR
3500 enum drbd_disk_state peer_disk) __must_hold(local)
3501{
69a22773 3502 struct drbd_device *device = peer_device->device;
b411b363
PR
3503 enum drbd_conns rv = C_MASK;
3504 enum drbd_disk_state mydisk;
44ed167d 3505 struct net_conf *nc;
d29e89e3 3506 int hg, rule_nr, rr_conflict, tentative, always_asbp;
b411b363 3507
b30ab791 3508 mydisk = device->state.disk;
b411b363 3509 if (mydisk == D_NEGOTIATING)
b30ab791 3510 mydisk = device->new_state_tmp.disk;
b411b363 3511
d0180171 3512 drbd_info(device, "drbd_sync_handshake:\n");
9f2247bb 3513
b30ab791
AG
3514 spin_lock_irq(&device->ldev->md.uuid_lock);
3515 drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3516 drbd_uuid_dump(device, "peer", device->p_uuid,
3517 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363 3518
f2d3d75b 3519 hg = drbd_uuid_compare(device, peer_role, &rule_nr);
b30ab791 3520 spin_unlock_irq(&device->ldev->md.uuid_lock);
b411b363 3521
d0180171 3522 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
b411b363
PR
3523
3524 if (hg == -1000) {
d0180171 3525 drbd_alert(device, "Unrelated data, aborting!\n");
b411b363
PR
3526 return C_MASK;
3527 }
f2d3d75b
LE
3528 if (hg < -0x10000) {
3529 int proto, fflags;
3530 hg = -hg;
3531 proto = hg & 0xff;
3532 fflags = (hg >> 8) & 0xff;
3533 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3534 proto, fflags);
3535 return C_MASK;
3536 }
4a23f264 3537 if (hg < -1000) {
d0180171 3538 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
3539 return C_MASK;
3540 }
3541
3542 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3543 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
3544 int f = (hg == -100) || abs(hg) == 2;
3545 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3546 if (f)
3547 hg = hg*2;
d0180171 3548 drbd_info(device, "Becoming sync %s due to disk states.\n",
b411b363
PR
3549 hg > 0 ? "source" : "target");
3550 }
3551
3a11a487 3552 if (abs(hg) == 100)
b30ab791 3553 drbd_khelper(device, "initial-split-brain");
3a11a487 3554
44ed167d 3555 rcu_read_lock();
69a22773 3556 nc = rcu_dereference(peer_device->connection->net_conf);
d29e89e3
RK
3557 always_asbp = nc->always_asbp;
3558 rr_conflict = nc->rr_conflict;
3559 tentative = nc->tentative;
3560 rcu_read_unlock();
44ed167d 3561
d29e89e3 3562 if (hg == 100 || (hg == -100 && always_asbp)) {
b30ab791 3563 int pcount = (device->state.role == R_PRIMARY)
b411b363
PR
3564 + (peer_role == R_PRIMARY);
3565 int forced = (hg == -100);
3566
3567 switch (pcount) {
3568 case 0:
69a22773 3569 hg = drbd_asb_recover_0p(peer_device);
b411b363
PR
3570 break;
3571 case 1:
69a22773 3572 hg = drbd_asb_recover_1p(peer_device);
b411b363
PR
3573 break;
3574 case 2:
69a22773 3575 hg = drbd_asb_recover_2p(peer_device);
b411b363
PR
3576 break;
3577 }
3578 if (abs(hg) < 100) {
d0180171 3579 drbd_warn(device, "Split-Brain detected, %d primaries, "
b411b363
PR
3580 "automatically solved. Sync from %s node\n",
3581 pcount, (hg < 0) ? "peer" : "this");
3582 if (forced) {
d0180171 3583 drbd_warn(device, "Doing a full sync, since"
 3584				     " UUIDs were ambiguous.\n");
3585 hg = hg*2;
3586 }
3587 }
3588 }
3589
3590 if (hg == -100) {
b30ab791 3591 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
b411b363 3592 hg = -1;
b30ab791 3593 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
b411b363
PR
3594 hg = 1;
3595
3596 if (abs(hg) < 100)
d0180171 3597 drbd_warn(device, "Split-Brain detected, manually solved. "
b411b363
PR
3598 "Sync from %s node\n",
3599 (hg < 0) ? "peer" : "this");
3600 }
3601
3602 if (hg == -100) {
580b9767
LE
3603 /* FIXME this log message is not correct if we end up here
3604 * after an attempted attach on a diskless node.
3605 * We just refuse to attach -- well, we drop the "connection"
3606 * to that disk, in a way... */
d0180171 3607 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
b30ab791 3608 drbd_khelper(device, "split-brain");
b411b363
PR
3609 return C_MASK;
3610 }
3611
3612 if (hg > 0 && mydisk <= D_INCONSISTENT) {
d0180171 3613 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
b411b363
PR
3614 return C_MASK;
3615 }
3616
3617 if (hg < 0 && /* by intention we do not use mydisk here. */
b30ab791 3618 device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
44ed167d 3619 switch (rr_conflict) {
b411b363 3620 case ASB_CALL_HELPER:
b30ab791 3621 drbd_khelper(device, "pri-lost");
b411b363
PR
3622 /* fall through */
3623 case ASB_DISCONNECT:
d0180171 3624 drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
b411b363
PR
3625 return C_MASK;
3626 case ASB_VIOLENTLY:
d0180171 3627 drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
 3628			     " assumption\n");
3629 }
3630 }
3631
69a22773 3632 if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
cf14c2e9 3633 if (hg == 0)
d0180171 3634 drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
cf14c2e9 3635 else
d0180171 3636 drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
cf14c2e9
PR
3637 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3638 abs(hg) >= 2 ? "full" : "bit-map based");
3639 return C_MASK;
3640 }
3641
b411b363 3642 if (abs(hg) >= 2) {
d0180171 3643 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
b30ab791 3644 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
20ceb2b2 3645 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3646 return C_MASK;
3647 }
3648
3649 if (hg > 0) { /* become sync source. */
3650 rv = C_WF_BITMAP_S;
3651 } else if (hg < 0) { /* become sync target */
3652 rv = C_WF_BITMAP_T;
3653 } else {
3654 rv = C_CONNECTED;
b30ab791 3655 if (drbd_bm_total_weight(device)) {
d0180171 3656 drbd_info(device, "No resync, but %lu bits in bitmap!\n",
b30ab791 3657 drbd_bm_total_weight(device));
b411b363
PR
3658 }
3659 }
3660
3661 return rv;
3662}
3663
f179d76d 3664static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3665{
3666 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3667 if (peer == ASB_DISCARD_REMOTE)
3668 return ASB_DISCARD_LOCAL;
b411b363
PR
3669
3670 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3671 if (peer == ASB_DISCARD_LOCAL)
3672 return ASB_DISCARD_REMOTE;
b411b363
PR
3673
3674 /* everything else is valid if they are equal on both sides. */
f179d76d 3675 return peer;
b411b363
PR
3676}
3677
bde89a9e 3678static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3679{
e658983a 3680 struct p_protocol *p = pi->data;
036b17ea
PR
3681 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3682 int p_proto, p_discard_my_data, p_two_primaries, cf;
3683 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3684 char integrity_alg[SHARED_SECRET_MAX] = "";
3d0e6375 3685 struct crypto_shash *peer_integrity_tfm = NULL;
7aca6c75 3686 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3687
b411b363
PR
3688 p_proto = be32_to_cpu(p->protocol);
3689 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3690 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3691 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3692 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3693 cf = be32_to_cpu(p->conn_flags);
6139f60d 3694 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3695
bde89a9e 3696 if (connection->agreed_pro_version >= 87) {
86db0618 3697 int err;
cf14c2e9 3698
88104ca4 3699 if (pi->size > sizeof(integrity_alg))
86db0618 3700 return -EIO;
bde89a9e 3701 err = drbd_recv_all(connection, integrity_alg, pi->size);
86db0618
AG
3702 if (err)
3703 return err;
036b17ea 3704 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
b411b363
PR
3705 }
3706
7d4c782c 3707 if (pi->cmd != P_PROTOCOL_UPDATE) {
bde89a9e 3708 clear_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3709
fbc12f45 3710 if (cf & CF_DRY_RUN)
bde89a9e 3711 set_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3712
fbc12f45 3713 rcu_read_lock();
bde89a9e 3714 nc = rcu_dereference(connection->net_conf);
b411b363 3715
fbc12f45 3716 if (p_proto != nc->wire_protocol) {
1ec861eb 3717 drbd_err(connection, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3718 goto disconnect_rcu_unlock;
3719 }
b411b363 3720
fbc12f45 3721 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
1ec861eb 3722 drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3723 goto disconnect_rcu_unlock;
3724 }
b411b363 3725
fbc12f45 3726 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
1ec861eb 3727 drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3728 goto disconnect_rcu_unlock;
3729 }
b411b363 3730
fbc12f45 3731 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
1ec861eb 3732 drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3733 goto disconnect_rcu_unlock;
3734 }
b411b363 3735
fbc12f45 3736 if (p_discard_my_data && nc->discard_my_data) {
1ec861eb 3737 drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3738 goto disconnect_rcu_unlock;
3739 }
b411b363 3740
fbc12f45 3741 if (p_two_primaries != nc->two_primaries) {
1ec861eb 3742 drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3743 goto disconnect_rcu_unlock;
3744 }
b411b363 3745
fbc12f45 3746 if (strcmp(integrity_alg, nc->integrity_alg)) {
1ec861eb 3747 drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3748 goto disconnect_rcu_unlock;
3749 }
b411b363 3750
fbc12f45 3751 rcu_read_unlock();
b411b363
PR
3752 }
3753
7d4c782c
AG
3754 if (integrity_alg[0]) {
3755 int hash_size;
3756
3757 /*
3758 * We can only change the peer data integrity algorithm
3759 * here. Changing our own data integrity algorithm
3760 * requires that we send a P_PROTOCOL_UPDATE packet at
3761 * the same time; otherwise, the peer has no way to
3762 * tell between which packets the algorithm should
3763 * change.
3764 */
b411b363 3765
3d234b33 3766 peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
1b57e663
LE
3767 if (IS_ERR(peer_integrity_tfm)) {
3768 peer_integrity_tfm = NULL;
1ec861eb 3769 drbd_err(connection, "peer data-integrity-alg %s not supported\n",
7d4c782c
AG
3770 integrity_alg);
3771 goto disconnect;
3772 }
b411b363 3773
3d0e6375 3774 hash_size = crypto_shash_digestsize(peer_integrity_tfm);
7d4c782c
AG
3775 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3776 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3777 if (!(int_dig_in && int_dig_vv)) {
1ec861eb 3778 drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
b411b363
PR
3779 goto disconnect;
3780 }
b411b363
PR
3781 }
3782
7d4c782c
AG
3783 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3784 if (!new_net_conf) {
1ec861eb 3785 drbd_err(connection, "Allocation of new net_conf failed\n");
7d4c782c
AG
3786 goto disconnect;
3787 }
3788
bde89a9e 3789 mutex_lock(&connection->data.mutex);
0500813f 3790 mutex_lock(&connection->resource->conf_update);
bde89a9e 3791 old_net_conf = connection->net_conf;
7d4c782c
AG
3792 *new_net_conf = *old_net_conf;
3793
3794 new_net_conf->wire_protocol = p_proto;
3795 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3796 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3797 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3798 new_net_conf->two_primaries = p_two_primaries;
3799
bde89a9e 3800 rcu_assign_pointer(connection->net_conf, new_net_conf);
0500813f 3801 mutex_unlock(&connection->resource->conf_update);
bde89a9e 3802 mutex_unlock(&connection->data.mutex);
7d4c782c 3803
3d0e6375 3804 crypto_free_shash(connection->peer_integrity_tfm);
bde89a9e
AG
3805 kfree(connection->int_dig_in);
3806 kfree(connection->int_dig_vv);
3807 connection->peer_integrity_tfm = peer_integrity_tfm;
3808 connection->int_dig_in = int_dig_in;
3809 connection->int_dig_vv = int_dig_vv;
7d4c782c
AG
3810
3811 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
1ec861eb 3812 drbd_info(connection, "peer data-integrity-alg: %s\n",
7d4c782c
AG
3813 integrity_alg[0] ? integrity_alg : "(none)");
3814
3815 synchronize_rcu();
3816 kfree(old_net_conf);
82bc0194 3817 return 0;
b411b363 3818
44ed167d
PR
3819disconnect_rcu_unlock:
3820 rcu_read_unlock();
b411b363 3821disconnect:
3d0e6375 3822 crypto_free_shash(peer_integrity_tfm);
036b17ea
PR
3823 kfree(int_dig_in);
3824 kfree(int_dig_vv);
bde89a9e 3825 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3826 return -EIO;
b411b363
PR
3827}
3828
3829/* helper function
3830 * input: alg name, feature name
3831 * return: NULL (alg name was "")
3832 * ERR_PTR(error) if something goes wrong
3833 * or the crypto hash ptr, if it worked out ok. */
3d0e6375
KC
3834static struct crypto_shash *drbd_crypto_alloc_digest_safe(
3835 const struct drbd_device *device,
b411b363
PR
3836 const char *alg, const char *name)
3837{
3d0e6375 3838 struct crypto_shash *tfm;
b411b363
PR
3839
3840 if (!alg[0])
3841 return NULL;
3842
3d0e6375 3843 tfm = crypto_alloc_shash(alg, 0, 0);
b411b363 3844 if (IS_ERR(tfm)) {
d0180171 3845 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
b411b363
PR
3846 alg, name, PTR_ERR(tfm));
3847 return tfm;
3848 }
b411b363
PR
3849 return tfm;
3850}
3851
bde89a9e 3852static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3853{
bde89a9e 3854 void *buffer = connection->data.rbuf;
4a76b161
AG
3855 int size = pi->size;
3856
3857 while (size) {
3858 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
bde89a9e 3859 s = drbd_recv(connection, buffer, s);
4a76b161
AG
3860 if (s <= 0) {
3861 if (s < 0)
3862 return s;
3863 break;
3864 }
3865 size -= s;
3866 }
3867 if (size)
3868 return -EIO;
3869 return 0;
3870}
3871
3872/*
3873 * config_unknown_volume - device configuration command for unknown volume
3874 *
3875 * When a device is added to an existing connection, the node on which the
3876 * device is added first will send configuration commands to its peer but the
3877 * peer will not know about the device yet. It will warn and ignore these
3878 * commands. Once the device is added on the second node, the second node will
3879 * send the same device configuration commands, but in the other direction.
3880 *
3881 * (We can also end up here if drbd is misconfigured.)
3882 */
bde89a9e 3883static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3884{
1ec861eb 3885 drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
2fcb8f30 3886 cmdname(pi->cmd), pi->vnr);
bde89a9e 3887 return ignore_remaining_packet(connection, pi);
4a76b161
AG
3888}
3889
bde89a9e 3890static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3891{
9f4fe9ad 3892 struct drbd_peer_device *peer_device;
b30ab791 3893 struct drbd_device *device;
e658983a 3894 struct p_rs_param_95 *p;
b411b363 3895 unsigned int header_size, data_size, exp_max_sz;
3d0e6375
KC
3896 struct crypto_shash *verify_tfm = NULL;
3897 struct crypto_shash *csums_tfm = NULL;
2ec91e0e 3898 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3899 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
bde89a9e 3900 const int apv = connection->agreed_pro_version;
813472ce 3901 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
778f271d 3902 int fifo_size = 0;
82bc0194 3903 int err;
b411b363 3904
9f4fe9ad
AG
3905 peer_device = conn_peer_device(connection, pi->vnr);
3906 if (!peer_device)
bde89a9e 3907 return config_unknown_volume(connection, pi);
9f4fe9ad 3908 device = peer_device->device;
b411b363
PR
3909
3910 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3911 : apv == 88 ? sizeof(struct p_rs_param)
3912 + SHARED_SECRET_MAX
8e26f9cc
PR
3913 : apv <= 94 ? sizeof(struct p_rs_param_89)
3914 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3915
e2857216 3916 if (pi->size > exp_max_sz) {
d0180171 3917 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3918 pi->size, exp_max_sz);
82bc0194 3919 return -EIO;
b411b363
PR
3920 }
3921
3922 if (apv <= 88) {
e658983a 3923 header_size = sizeof(struct p_rs_param);
e2857216 3924 data_size = pi->size - header_size;
8e26f9cc 3925 } else if (apv <= 94) {
e658983a 3926 header_size = sizeof(struct p_rs_param_89);
e2857216 3927 data_size = pi->size - header_size;
0b0ba1ef 3928 D_ASSERT(device, data_size == 0);
8e26f9cc 3929 } else {
e658983a 3930 header_size = sizeof(struct p_rs_param_95);
e2857216 3931 data_size = pi->size - header_size;
0b0ba1ef 3932 D_ASSERT(device, data_size == 0);
b411b363
PR
3933 }
3934
3935 /* initialize verify_alg and csums_alg */
e658983a 3936 p = pi->data;
b411b363
PR
3937 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3938
9f4fe9ad 3939 err = drbd_recv_all(peer_device->connection, p, header_size);
82bc0194
AG
3940 if (err)
3941 return err;
b411b363 3942
0500813f 3943 mutex_lock(&connection->resource->conf_update);
9f4fe9ad 3944 old_net_conf = peer_device->connection->net_conf;
b30ab791 3945 if (get_ldev(device)) {
813472ce
PR
3946 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3947 if (!new_disk_conf) {
b30ab791 3948 put_ldev(device);
0500813f 3949 mutex_unlock(&connection->resource->conf_update);
d0180171 3950 drbd_err(device, "Allocation of new disk_conf failed\n");
813472ce
PR
3951 return -ENOMEM;
3952 }
daeda1cc 3953
b30ab791 3954 old_disk_conf = device->ldev->disk_conf;
813472ce 3955 *new_disk_conf = *old_disk_conf;
b411b363 3956
6394b935 3957 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3958 }
b411b363
PR
3959
3960 if (apv >= 88) {
3961 if (apv == 88) {
5de73827 3962 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
d0180171 3963 drbd_err(device, "verify-alg of wrong size, "
 3964				    "peer wants %u, accepting only up to %u bytes\n",
3965 data_size, SHARED_SECRET_MAX);
813472ce
PR
3966 err = -EIO;
3967 goto reconnect;
b411b363
PR
3968 }
3969
9f4fe9ad 3970 err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
813472ce
PR
3971 if (err)
3972 goto reconnect;
b411b363
PR
3973 /* we expect NUL terminated string */
3974 /* but just in case someone tries to be evil */
0b0ba1ef 3975 D_ASSERT(device, p->verify_alg[data_size-1] == 0);
b411b363
PR
3976 p->verify_alg[data_size-1] = 0;
3977
3978 } else /* apv >= 89 */ {
3979 /* we still expect NUL terminated strings */
3980 /* but just in case someone tries to be evil */
0b0ba1ef
AG
3981 D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3982 D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
b411b363
PR
3983 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3984 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3985 }
3986
2ec91e0e 3987 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b30ab791 3988 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 3989 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3990 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3991 goto disconnect;
3992 }
b30ab791 3993 verify_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
3994 p->verify_alg, "verify-alg");
3995 if (IS_ERR(verify_tfm)) {
3996 verify_tfm = NULL;
3997 goto disconnect;
3998 }
3999 }
4000
2ec91e0e 4001 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b30ab791 4002 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 4003 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 4004 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
4005 goto disconnect;
4006 }
b30ab791 4007 csums_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
4008 p->csums_alg, "csums-alg");
4009 if (IS_ERR(csums_tfm)) {
4010 csums_tfm = NULL;
4011 goto disconnect;
4012 }
4013 }
4014
813472ce 4015 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
4016 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
4017 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
4018 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
4019 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 4020
daeda1cc 4021 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
b30ab791 4022 if (fifo_size != device->rs_plan_s->size) {
813472ce
PR
4023 new_plan = fifo_alloc(fifo_size);
4024 if (!new_plan) {
d0180171 4025 drbd_err(device, "kmalloc of fifo_buffer failed");
b30ab791 4026 put_ldev(device);
778f271d
PR
4027 goto disconnect;
4028 }
4029 }
8e26f9cc 4030 }
b411b363 4031
91fd4dad 4032 if (verify_tfm || csums_tfm) {
2ec91e0e
PR
4033 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
4034 if (!new_net_conf) {
d0180171 4035 drbd_err(device, "Allocation of new net_conf failed\n");
91fd4dad
PR
4036 goto disconnect;
4037 }
4038
2ec91e0e 4039 *new_net_conf = *old_net_conf;
91fd4dad
PR
4040
4041 if (verify_tfm) {
2ec91e0e
PR
4042 strcpy(new_net_conf->verify_alg, p->verify_alg);
4043 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3d0e6375 4044 crypto_free_shash(peer_device->connection->verify_tfm);
9f4fe9ad 4045 peer_device->connection->verify_tfm = verify_tfm;
d0180171 4046 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
91fd4dad
PR
4047 }
4048 if (csums_tfm) {
2ec91e0e
PR
4049 strcpy(new_net_conf->csums_alg, p->csums_alg);
4050 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3d0e6375 4051 crypto_free_shash(peer_device->connection->csums_tfm);
9f4fe9ad 4052 peer_device->connection->csums_tfm = csums_tfm;
d0180171 4053 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
91fd4dad 4054 }
bde89a9e 4055 rcu_assign_pointer(connection->net_conf, new_net_conf);
778f271d 4056 }
b411b363
PR
4057 }
4058
813472ce 4059 if (new_disk_conf) {
b30ab791
AG
4060 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4061 put_ldev(device);
813472ce
PR
4062 }
4063
4064 if (new_plan) {
b30ab791
AG
4065 old_plan = device->rs_plan_s;
4066 rcu_assign_pointer(device->rs_plan_s, new_plan);
b411b363 4067 }
daeda1cc 4068
0500813f 4069 mutex_unlock(&connection->resource->conf_update);
daeda1cc
PR
4070 synchronize_rcu();
4071 if (new_net_conf)
4072 kfree(old_net_conf);
4073 kfree(old_disk_conf);
813472ce 4074 kfree(old_plan);
daeda1cc 4075
82bc0194 4076 return 0;
b411b363 4077
813472ce
PR
4078reconnect:
4079 if (new_disk_conf) {
b30ab791 4080 put_ldev(device);
813472ce
PR
4081 kfree(new_disk_conf);
4082 }
0500813f 4083 mutex_unlock(&connection->resource->conf_update);
813472ce
PR
4084 return -EIO;
4085
b411b363 4086disconnect:
813472ce
PR
4087 kfree(new_plan);
4088 if (new_disk_conf) {
b30ab791 4089 put_ldev(device);
813472ce
PR
4090 kfree(new_disk_conf);
4091 }
0500813f 4092 mutex_unlock(&connection->resource->conf_update);
b411b363
PR
4093 /* just for completeness: actually not needed,
4094 * as this is not reached if csums_tfm was ok. */
3d0e6375 4095 crypto_free_shash(csums_tfm);
b411b363 4096 /* but free the verify_tfm again, if csums_tfm did not work out */
3d0e6375 4097 crypto_free_shash(verify_tfm);
9f4fe9ad 4098 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4099 return -EIO;
b411b363
PR
4100}
4101
b411b363 4102/* warn if the arguments differ by more than 12.5% */
b30ab791 4103static void warn_if_differ_considerably(struct drbd_device *device,
b411b363
PR
4104 const char *s, sector_t a, sector_t b)
4105{
4106 sector_t d;
4107 if (a == 0 || b == 0)
4108 return;
4109 d = (a > b) ? (a - b) : (b - a);
4110 if (d > (a>>3) || d > (b>>3))
d0180171 4111 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
b411b363
PR
4112 (unsigned long long)a, (unsigned long long)b);
4113}
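A quick worked example of the 12.5% check above, with illustrative numbers: a>>3 is a/8, i.e. 12.5% of a. For a = 1000 and b = 900 sectors, d = 100 stays below both a>>3 = 125 and b>>3 = 112, so nothing is logged; for a = 1000 and b = 860, d = 140 exceeds a>>3 = 125 and the warning is emitted.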
4114
bde89a9e 4115static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4116{
9f4fe9ad 4117 struct drbd_peer_device *peer_device;
b30ab791 4118 struct drbd_device *device;
e658983a 4119 struct p_sizes *p = pi->data;
9104d31a 4120 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
e96c9633 4121 enum determine_dev_size dd = DS_UNCHANGED;
6a8d68b1 4122 sector_t p_size, p_usize, p_csize, my_usize;
94c43a13 4123 sector_t new_size, cur_size;
b411b363 4124 int ldsc = 0; /* local disk size changed */
e89b591c 4125 enum dds_flags ddsf;
b411b363 4126
9f4fe9ad
AG
4127 peer_device = conn_peer_device(connection, pi->vnr);
4128 if (!peer_device)
bde89a9e 4129 return config_unknown_volume(connection, pi);
9f4fe9ad 4130 device = peer_device->device;
94c43a13 4131 cur_size = drbd_get_capacity(device->this_bdev);
4a76b161 4132
b411b363
PR
4133 p_size = be64_to_cpu(p->d_size);
4134 p_usize = be64_to_cpu(p->u_size);
6a8d68b1 4135 p_csize = be64_to_cpu(p->c_size);
b411b363 4136
b411b363
PR
4137 /* just store the peer's disk size for now.
4138 * we still need to figure out whether we accept that. */
b30ab791 4139 device->p_size = p_size;
b411b363 4140
b30ab791 4141 if (get_ldev(device)) {
daeda1cc 4142 rcu_read_lock();
b30ab791 4143 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
daeda1cc
PR
4144 rcu_read_unlock();
4145
b30ab791
AG
4146 warn_if_differ_considerably(device, "lower level device sizes",
4147 p_size, drbd_get_max_capacity(device->ldev));
4148 warn_if_differ_considerably(device, "user requested size",
daeda1cc 4149 p_usize, my_usize);
b411b363
PR
4150
4151 /* if this is the first connect, or an otherwise expected
4152 * param exchange, choose the minimum */
b30ab791 4153 if (device->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 4154 p_usize = min_not_zero(my_usize, p_usize);
b411b363 4155
ad6e8979
LE
4156 /* Never shrink a device with usable data during connect,
4157 * or "attach" on the peer.
4158 * But allow online shrinking if we are connected. */
60bac040 4159 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
60bac040 4160 if (new_size < cur_size &&
b30ab791 4161 device->state.disk >= D_OUTDATED &&
ad6e8979 4162 (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
60bac040
LE
4163 drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4164 (unsigned long long)new_size, (unsigned long long)cur_size);
9f4fe9ad 4165 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
b30ab791 4166 put_ldev(device);
82bc0194 4167 return -EIO;
b411b363 4168 }
daeda1cc
PR
4169
4170 if (my_usize != p_usize) {
4171 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
4172
4173 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
4174 if (!new_disk_conf) {
d0180171 4175 drbd_err(device, "Allocation of new disk_conf failed\n");
b30ab791 4176 put_ldev(device);
daeda1cc
PR
4177 return -ENOMEM;
4178 }
4179
0500813f 4180 mutex_lock(&connection->resource->conf_update);
b30ab791 4181 old_disk_conf = device->ldev->disk_conf;
daeda1cc
PR
4182 *new_disk_conf = *old_disk_conf;
4183 new_disk_conf->disk_size = p_usize;
4184
b30ab791 4185 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
0500813f 4186 mutex_unlock(&connection->resource->conf_update);
daeda1cc
PR
4187 synchronize_rcu();
4188 kfree(old_disk_conf);
4189
ad6e8979
LE
4190 drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
4191 (unsigned long)p_usize, (unsigned long)my_usize);
b411b363 4192 }
daeda1cc 4193
b30ab791 4194 put_ldev(device);
b411b363 4195 }
b411b363 4196
20c68fde 4197 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
dd4f699d 4198 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
20c68fde 4199 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
dd4f699d 4200 drbd_reconsider_queue_parameters(), we can be sure that after
20c68fde
LE
4201 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4202
e89b591c 4203 ddsf = be16_to_cpu(p->dds_flags);
b30ab791 4204 if (get_ldev(device)) {
9104d31a 4205 drbd_reconsider_queue_parameters(device, device->ldev, o);
b30ab791
AG
4206 dd = drbd_determine_dev_size(device, ddsf, NULL);
4207 put_ldev(device);
e96c9633 4208 if (dd == DS_ERROR)
82bc0194 4209 return -EIO;
b30ab791 4210 drbd_md_sync(device);
b411b363 4211 } else {
4212 /*
4213 * I am diskless, need to accept the peer's *current* size.
 4214		 * I must NOT accept the peer's backing disk size,
4215 * it may have been larger than mine all along...
4216 *
4217 * At this point, the peer knows more about my disk, or at
4218 * least about what we last agreed upon, than myself.
4219 * So if his c_size is less than his d_size, the most likely
4220 * reason is that *my* d_size was smaller last time we checked.
4221 *
4222 * However, if he sends a zero current size,
4223 * take his (user-capped or) backing disk size anyways.
4224 *
4225 * Unless of course he does not have a disk himself.
4226 * In which case we ignore this completely.
6a8d68b1 4227 */
94c43a13 4228 sector_t new_size = p_csize ?: p_usize ?: p_size;
9104d31a 4229 drbd_reconsider_queue_parameters(device, NULL, o);
4230 if (new_size == 0) {
 4231			/* Ignore, the peer does not know anything. */
4232 } else if (new_size == cur_size) {
4233 /* nothing to do */
4234 } else if (cur_size != 0 && p_size == 0) {
4235 drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
4236 (unsigned long long)new_size, (unsigned long long)cur_size);
4237 } else if (new_size < cur_size && device->state.role == R_PRIMARY) {
4238 drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
4239 (unsigned long long)new_size, (unsigned long long)cur_size);
4240 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4241 return -EIO;
4242 } else {
4243 /* I believe the peer, if
4244 * - I don't have a current size myself
4245 * - we agree on the size anyways
4246 * - I do have a current size, am Secondary,
4247 * and he has the only disk
4248 * - I do have a current size, am Primary,
4249 * and he has the only disk,
4250 * which is larger than my current size
4251 */
4252 drbd_set_my_capacity(device, new_size);
4253 }
b411b363
PR
4254 }
4255
b30ab791
AG
4256 if (get_ldev(device)) {
4257 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4258 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
b411b363
PR
4259 ldsc = 1;
4260 }
4261
b30ab791 4262 put_ldev(device);
b411b363
PR
4263 }
4264
b30ab791 4265 if (device->state.conn > C_WF_REPORT_PARAMS) {
b411b363 4266 if (be64_to_cpu(p->c_size) !=
b30ab791 4267 drbd_get_capacity(device->this_bdev) || ldsc) {
b411b363
PR
4268 /* we have different sizes, probably peer
4269 * needs to know my new size... */
69a22773 4270 drbd_send_sizes(peer_device, 0, ddsf);
b411b363 4271 }
b30ab791
AG
4272 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4273 (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4274 if (device->state.pdsk >= D_INCONSISTENT &&
4275 device->state.disk >= D_INCONSISTENT) {
e89b591c 4276 if (ddsf & DDSF_NO_RESYNC)
d0180171 4277 drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
e89b591c 4278 else
b30ab791 4279 resync_after_online_grow(device);
e89b591c 4280 } else
b30ab791 4281 set_bit(RESYNC_AFTER_NEG, &device->flags);
b411b363
PR
4282 }
4283 }
4284
82bc0194 4285 return 0;
b411b363
PR
4286}
4287
bde89a9e 4288static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4289{
9f4fe9ad 4290 struct drbd_peer_device *peer_device;
b30ab791 4291 struct drbd_device *device;
e658983a 4292 struct p_uuids *p = pi->data;
b411b363 4293 u64 *p_uuid;
62b0da3a 4294 int i, updated_uuids = 0;
b411b363 4295
9f4fe9ad
AG
4296 peer_device = conn_peer_device(connection, pi->vnr);
4297 if (!peer_device)
bde89a9e 4298 return config_unknown_volume(connection, pi);
9f4fe9ad 4299 device = peer_device->device;
4a76b161 4300
365cf663 4301 p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
063eacf8 4302 if (!p_uuid) {
d0180171 4303 drbd_err(device, "kmalloc of p_uuid failed\n");
063eacf8
JW
4304 return false;
4305 }
b411b363
PR
4306
4307 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4308 p_uuid[i] = be64_to_cpu(p->uuid[i]);
4309
b30ab791
AG
4310 kfree(device->p_uuid);
4311 device->p_uuid = p_uuid;
b411b363 4312
b17b5960 4313 if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
b30ab791
AG
4314 device->state.disk < D_INCONSISTENT &&
4315 device->state.role == R_PRIMARY &&
4316 (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
d0180171 4317 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
b30ab791 4318 (unsigned long long)device->ed_uuid);
9f4fe9ad 4319 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4320 return -EIO;
b411b363
PR
4321 }
4322
b30ab791 4323 if (get_ldev(device)) {
b411b363 4324 int skip_initial_sync =
b30ab791 4325 device->state.conn == C_CONNECTED &&
9f4fe9ad 4326 peer_device->connection->agreed_pro_version >= 90 &&
b30ab791 4327 device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
b411b363
PR
4328 (p_uuid[UI_FLAGS] & 8);
4329 if (skip_initial_sync) {
d0180171 4330 drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
b30ab791 4331 drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
20ceb2b2
LE
4332 "clear_n_write from receive_uuids",
4333 BM_LOCKED_TEST_ALLOWED);
b30ab791
AG
4334 _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4335 _drbd_uuid_set(device, UI_BITMAP, 0);
4336 _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
b411b363 4337 CS_VERBOSE, NULL);
b30ab791 4338 drbd_md_sync(device);
62b0da3a 4339 updated_uuids = 1;
b411b363 4340 }
b30ab791
AG
4341 put_ldev(device);
4342 } else if (device->state.disk < D_INCONSISTENT &&
4343 device->state.role == R_PRIMARY) {
18a50fa2
PR
4344 /* I am a diskless primary, the peer just created a new current UUID
4345 for me. */
b30ab791 4346 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
b411b363
PR
4347 }
4348
 4349	/* Before we test for the disk state, we should wait until a possibly
 4350	   ongoing cluster-wide state change has finished. That is important if
4351 we are primary and are detaching from our disk. We need to see the
4352 new disk state... */
b30ab791
AG
4353 mutex_lock(device->state_mutex);
4354 mutex_unlock(device->state_mutex);
4355 if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4356 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
62b0da3a
LE
4357
4358 if (updated_uuids)
b30ab791 4359 drbd_print_uuids(device, "receiver updated UUIDs to");
b411b363 4360
82bc0194 4361 return 0;
b411b363
PR
4362}
4363
4364/**
4365 * convert_state() - Converts the peer's view of the cluster state to our point of view
4366 * @ps: The state as seen by the peer.
4367 */
4368static union drbd_state convert_state(union drbd_state ps)
4369{
4370 union drbd_state ms;
4371
4372 static enum drbd_conns c_tab[] = {
369bea63 4373 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
4374 [C_CONNECTED] = C_CONNECTED,
4375
4376 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4377 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4378 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4379 [C_VERIFY_S] = C_VERIFY_T,
4380 [C_MASK] = C_MASK,
4381 };
4382
4383 ms.i = ps.i;
4384
4385 ms.conn = c_tab[ps.conn];
4386 ms.peer = ps.role;
4387 ms.role = ps.peer;
4388 ms.pdsk = ps.disk;
4389 ms.disk = ps.pdsk;
4390 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4391
4392 return ms;
4393}
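A worked example of the conversion above (state values are illustrative): if the peer reports role = Primary, peer = Secondary, disk = D_UP_TO_DATE, pdsk = D_INCONSISTENT and conn = C_STARTING_SYNC_S, the converted state as seen locally is role = Secondary, peer = Primary, disk = D_INCONSISTENT, pdsk = D_UP_TO_DATE and conn = C_STARTING_SYNC_T; the two isp flags are OR-ed into peer_isp.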
4394
bde89a9e 4395static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4396{
9f4fe9ad 4397 struct drbd_peer_device *peer_device;
b30ab791 4398 struct drbd_device *device;
e658983a 4399 struct p_req_state *p = pi->data;
b411b363 4400 union drbd_state mask, val;
bf885f8a 4401 enum drbd_state_rv rv;
b411b363 4402
9f4fe9ad
AG
4403 peer_device = conn_peer_device(connection, pi->vnr);
4404 if (!peer_device)
4a76b161 4405 return -EIO;
9f4fe9ad 4406 device = peer_device->device;
4a76b161 4407
b411b363
PR
4408 mask.i = be32_to_cpu(p->mask);
4409 val.i = be32_to_cpu(p->val);
4410
9f4fe9ad 4411 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
b30ab791 4412 mutex_is_locked(device->state_mutex)) {
69a22773 4413 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
82bc0194 4414 return 0;
b411b363
PR
4415 }
4416
4417 mask = convert_state(mask);
4418 val = convert_state(val);
4419
b30ab791 4420 rv = drbd_change_state(device, CS_VERBOSE, mask, val);
69a22773 4421 drbd_send_sr_reply(peer_device, rv);
b411b363 4422
b30ab791 4423 drbd_md_sync(device);
b411b363 4424
82bc0194 4425 return 0;
b411b363
PR
4426}
4427
bde89a9e 4428static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4429{
e658983a 4430 struct p_req_state *p = pi->data;
b411b363 4431 union drbd_state mask, val;
bf885f8a 4432 enum drbd_state_rv rv;
b411b363 4433
b411b363
PR
4434 mask.i = be32_to_cpu(p->mask);
4435 val.i = be32_to_cpu(p->val);
4436
bde89a9e
AG
4437 if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4438 mutex_is_locked(&connection->cstate_mutex)) {
4439 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
82bc0194 4440 return 0;
b411b363
PR
4441 }
4442
4443 mask = convert_state(mask);
4444 val = convert_state(val);
4445
bde89a9e
AG
4446 rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4447 conn_send_sr_reply(connection, rv);
b411b363 4448
82bc0194 4449 return 0;
b411b363
PR
4450}
4451
bde89a9e 4452static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4453{
9f4fe9ad 4454 struct drbd_peer_device *peer_device;
b30ab791 4455 struct drbd_device *device;
e658983a 4456 struct p_state *p = pi->data;
4ac4aada 4457 union drbd_state os, ns, peer_state;
b411b363 4458 enum drbd_disk_state real_peer_disk;
65d922c3 4459 enum chg_state_flags cs_flags;
b411b363
PR
4460 int rv;
4461
9f4fe9ad
AG
4462 peer_device = conn_peer_device(connection, pi->vnr);
4463 if (!peer_device)
bde89a9e 4464 return config_unknown_volume(connection, pi);
9f4fe9ad 4465 device = peer_device->device;
4a76b161 4466
b411b363
PR
4467 peer_state.i = be32_to_cpu(p->state);
4468
4469 real_peer_disk = peer_state.disk;
4470 if (peer_state.disk == D_NEGOTIATING) {
b30ab791 4471 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
d0180171 4472 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
b411b363
PR
4473 }
4474
0500813f 4475 spin_lock_irq(&device->resource->req_lock);
b411b363 4476 retry:
b30ab791 4477 os = ns = drbd_read_state(device);
0500813f 4478 spin_unlock_irq(&device->resource->req_lock);
b411b363 4479
668700b4 4480 /* If some other part of the code (ack_receiver thread, timeout)
545752d5
LE
4481 * already decided to close the connection again,
4482 * we must not "re-establish" it here. */
4483 if (os.conn <= C_TEAR_DOWN)
58ffa580 4484 return -ECONNRESET;
545752d5 4485
40424e4a
LE
4486 /* If this is the "end of sync" confirmation, usually the peer disk
4487 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4488 * set) resync started in PausedSyncT, or if the timing of pause-/
4489 * unpause-sync events has been "just right", the peer disk may
4490 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4491 */
4492 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4493 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
4494 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4495 /* If we are (becoming) SyncSource, but peer is still in sync
4496 * preparation, ignore its uptodate-ness to avoid flapping, it
4497 * will change to inconsistent once the peer reaches active
4498 * syncing states.
4499 * It may have changed syncer-paused flags, however, so we
4500 * cannot ignore this completely. */
4501 if (peer_state.conn > C_CONNECTED &&
4502 peer_state.conn < C_SYNC_SOURCE)
4503 real_peer_disk = D_INCONSISTENT;
4504
4505 /* if peer_state changes to connected at the same time,
4506 * it explicitly notifies us that it finished resync.
4507 * Maybe we should finish it up, too? */
4508 else if (os.conn >= C_SYNC_SOURCE &&
4509 peer_state.conn == C_CONNECTED) {
b30ab791
AG
4510 if (drbd_bm_total_weight(device) <= device->rs_failed)
4511 drbd_resync_finished(device);
82bc0194 4512 return 0;
e9ef7bb6
LE
4513 }
4514 }
4515
02b91b55
LE
4516 /* explicit verify finished notification, stop sector reached. */
4517 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4518 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
b30ab791
AG
4519 ov_out_of_sync_print(device);
4520 drbd_resync_finished(device);
58ffa580 4521 return 0;
02b91b55
LE
4522 }
4523
e9ef7bb6
LE
4524 /* peer says his disk is inconsistent, while we think it is uptodate,
4525 * and this happens while the peer still thinks we have a sync going on,
4526 * but we think we are already done with the sync.
4527 * We ignore this to avoid flapping pdsk.
4528 * This should not happen, if the peer is a recent version of drbd. */
4529 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4530 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4531 real_peer_disk = D_UP_TO_DATE;
4532
4ac4aada
LE
4533 if (ns.conn == C_WF_REPORT_PARAMS)
4534 ns.conn = C_CONNECTED;
b411b363 4535
67531718
PR
4536 if (peer_state.conn == C_AHEAD)
4537 ns.conn = C_BEHIND;
4538
fe43ed97
LE
4539 /* TODO:
4540 * if (primary and diskless and peer uuid != effective uuid)
4541 * abort attach on peer;
4542 *
4543 * If this node does not have good data, was already connected, but
4544 * the peer did a late attach only now, trying to "negotiate" with me,
4545 * AND I am currently Primary, possibly frozen, with some specific
4546 * "effective" uuid, this should never be reached, really, because
4547 * we first send the uuids, then the current state.
4548 *
4549 * In this scenario, we already dropped the connection hard
4551 * when we received the unsuitable uuids (receive_uuids()).
4551 *
4552 * Should we want to change this, that is: not drop the connection in
4553 * receive_uuids() already, then we would need to add a branch here
4554 * that aborts the attach of "unsuitable uuids" on the peer in case
4555 * this node is currently Diskless Primary.
4556 */
4557
b30ab791
AG
4558 if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4559 get_ldev_if_state(device, D_NEGOTIATING)) {
b411b363
PR
4560 int cr; /* consider resync */
4561
4562 /* if we established a new connection */
4ac4aada 4563 cr = (os.conn < C_CONNECTED);
b411b363
PR
4564 /* if we had an established connection
4565 * and one of the nodes newly attaches a disk */
4ac4aada 4566 cr |= (os.conn == C_CONNECTED &&
b411b363 4567 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 4568 os.disk == D_NEGOTIATING));
b411b363 4569 /* if we have both been inconsistent, and the peer has been
a2823ea9 4570 * forced to be UpToDate with --force */
b30ab791 4571 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4572 /* if we had been plain connected, and the admin requested to
4573 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 4574 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
4575 (peer_state.conn >= C_STARTING_SYNC_S &&
4576 peer_state.conn <= C_WF_BITMAP_T));
4577
4578 if (cr)
69a22773 4579 ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
b411b363 4580
b30ab791 4581 put_ldev(device);
4ac4aada
LE
4582 if (ns.conn == C_MASK) {
4583 ns.conn = C_CONNECTED;
b30ab791
AG
4584 if (device->state.disk == D_NEGOTIATING) {
4585 drbd_force_state(device, NS(disk, D_FAILED));
b411b363 4586 } else if (peer_state.disk == D_NEGOTIATING) {
d0180171 4587 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
b411b363 4588 peer_state.disk = D_DISKLESS;
580b9767 4589 real_peer_disk = D_DISKLESS;
b411b363 4590 } else {
9f4fe9ad 4591 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
82bc0194 4592 return -EIO;
0b0ba1ef 4593 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
9f4fe9ad 4594 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4595 return -EIO;
b411b363
PR
4596 }
4597 }
4598 }
4599
0500813f 4600 spin_lock_irq(&device->resource->req_lock);
b30ab791 4601 if (os.i != drbd_read_state(device).i)
b411b363 4602 goto retry;
b30ab791 4603 clear_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4604 ns.peer = peer_state.role;
4605 ns.pdsk = real_peer_disk;
4606 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 4607 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b30ab791 4608 ns.disk = device->new_state_tmp.disk;
4ac4aada 4609 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
b30ab791
AG
4610 if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4611 test_bit(NEW_CUR_UUID, &device->flags)) {
8554df1c 4612 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 4613 for temporary network outages! */
0500813f 4614 spin_unlock_irq(&device->resource->req_lock);
d0180171 4615 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
9f4fe9ad 4616 tl_clear(peer_device->connection);
b30ab791
AG
4617 drbd_uuid_new_current(device);
4618 clear_bit(NEW_CUR_UUID, &device->flags);
9f4fe9ad 4619 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 4620 return -EIO;
481c6f50 4621 }
b30ab791
AG
4622 rv = _drbd_set_state(device, ns, cs_flags, NULL);
4623 ns = drbd_read_state(device);
0500813f 4624 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
4625
4626 if (rv < SS_SUCCESS) {
9f4fe9ad 4627 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4628 return -EIO;
b411b363
PR
4629 }
4630
4ac4aada
LE
4631 if (os.conn > C_WF_REPORT_PARAMS) {
4632 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
4633 peer_state.disk != D_NEGOTIATING ) {
4634 /* we want resync, peer has not yet decided to sync... */
4635 /* Nowadays only used when forcing a node into primary role and
4636 setting its disk to UpToDate with that */
69a22773
AG
4637 drbd_send_uuids(peer_device);
4638 drbd_send_current_state(peer_device);
b411b363
PR
4639 }
4640 }
4641
b30ab791 4642 clear_bit(DISCARD_MY_DATA, &device->flags);
b411b363 4643
b30ab791 4644 drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
b411b363 4645
82bc0194 4646 return 0;
b411b363
PR
4647}
4648
bde89a9e 4649static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4650{
9f4fe9ad 4651 struct drbd_peer_device *peer_device;
b30ab791 4652 struct drbd_device *device;
e658983a 4653 struct p_rs_uuid *p = pi->data;
4a76b161 4654
9f4fe9ad
AG
4655 peer_device = conn_peer_device(connection, pi->vnr);
4656 if (!peer_device)
4a76b161 4657 return -EIO;
9f4fe9ad 4658 device = peer_device->device;
b411b363 4659
b30ab791
AG
4660 wait_event(device->misc_wait,
4661 device->state.conn == C_WF_SYNC_UUID ||
4662 device->state.conn == C_BEHIND ||
4663 device->state.conn < C_CONNECTED ||
4664 device->state.disk < D_NEGOTIATING);
b411b363 4665
0b0ba1ef 4666 /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
b411b363 4667
b411b363
PR
4668 /* Here the _drbd_uuid_ functions are right, current should
4669 _not_ be rotated into the history */
b30ab791
AG
4670 if (get_ldev_if_state(device, D_NEGOTIATING)) {
4671 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4672 _drbd_uuid_set(device, UI_BITMAP, 0UL);
b411b363 4673
b30ab791
AG
4674 drbd_print_uuids(device, "updated sync uuid");
4675 drbd_start_resync(device, C_SYNC_TARGET);
b411b363 4676
b30ab791 4677 put_ldev(device);
b411b363 4678 } else
d0180171 4679 drbd_err(device, "Ignoring SyncUUID packet!\n");
b411b363 4680
82bc0194 4681 return 0;
b411b363
PR
4682}
4683
2c46407d
AG
4684/**
4685 * receive_bitmap_plain - receive and merge one chunk of an uncompressed bitmap
4686 *
4687 * Return 0 when done, 1 when another iteration is needed, and a negative error
4688 * code upon failure.
4689 */
4690static int
69a22773 4691receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
e658983a 4692 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4693{
50d0b1ad 4694 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
69a22773 4695 drbd_header_size(peer_device->connection);
e658983a 4696 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4697 c->bm_words - c->word_offset);
e658983a 4698 unsigned int want = num_words * sizeof(*p);
2c46407d 4699 int err;
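 /* Rough sizing example (illustrative only): assuming a 4 KiB socket
  * buffer and a 16 byte header, data_size is 4080 bytes, i.e. up to 510
  * 64-bit bitmap words are merged per P_BITMAP packet; the last packet
  * of a transfer is usually shorter. */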
b411b363 4700
50d0b1ad 4701 if (want != size) {
69a22773 4702 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4703 return -EIO;
b411b363
PR
4704 }
4705 if (want == 0)
2c46407d 4706 return 0;
69a22773 4707 err = drbd_recv_all(peer_device->connection, p, want);
82bc0194 4708 if (err)
2c46407d 4709 return err;
b411b363 4710
69a22773 4711 drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
b411b363
PR
4712
4713 c->word_offset += num_words;
4714 c->bit_offset = c->word_offset * BITS_PER_LONG;
4715 if (c->bit_offset > c->bm_bits)
4716 c->bit_offset = c->bm_bits;
4717
2c46407d 4718 return 1;
b411b363
PR
4719}
4720
a02d1240
AG
4721static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4722{
4723 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4724}
4725
4726static int dcbp_get_start(struct p_compressed_bm *p)
4727{
4728 return (p->encoding & 0x80) != 0;
4729}
4730
4731static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4732{
4733 return (p->encoding >> 4) & 0x7;
4734}
4735
2c46407d
AG
4736/**
4737 * recv_bm_rle_bits - decode one chunk of a VLI/RLE compressed bitmap
4738 *
4739 * Return 0 when done, 1 when another iteration is needed, and a negative error
4740 * code upon failure.
4741 */
4742static int
69a22773 4743recv_bm_rle_bits(struct drbd_peer_device *peer_device,
b411b363 4744 struct p_compressed_bm *p,
c6d25cfe
PR
4745 struct bm_xfer_ctx *c,
4746 unsigned int len)
b411b363
PR
4747{
4748 struct bitstream bs;
4749 u64 look_ahead;
4750 u64 rl;
4751 u64 tmp;
4752 unsigned long s = c->bit_offset;
4753 unsigned long e;
a02d1240 4754 int toggle = dcbp_get_start(p);
b411b363
PR
4755 int have;
4756 int bits;
4757
a02d1240 4758 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4759
4760 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4761 if (bits < 0)
2c46407d 4762 return -EIO;
b411b363
PR
4763
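 /* The payload is a sequence of VLI-encoded run lengths.  Runs alternate
  * between "clear" and "set" ranges of the bitmap, starting with
  * dcbp_get_start(p).  E.g. start=0 with runs 1000, 16, 84 means: skip
  * 1000 in-sync bits, mark the next 16 bits out-of-sync, then skip
  * another 84 bits. */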
4764 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4765 bits = vli_decode_bits(&rl, look_ahead);
4766 if (bits <= 0)
2c46407d 4767 return -EIO;
b411b363
PR
4768
4769 if (toggle) {
4770 e = s + rl -1;
4771 if (e >= c->bm_bits) {
69a22773 4772 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4773 return -EIO;
b411b363 4774 }
69a22773 4775 _drbd_bm_set_bits(peer_device->device, s, e);
b411b363
PR
4776 }
4777
4778 if (have < bits) {
69a22773 4779 drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
b411b363
PR
4780 have, bits, look_ahead,
4781 (unsigned int)(bs.cur.b - p->code),
4782 (unsigned int)bs.buf_len);
2c46407d 4783 return -EIO;
b411b363 4784 }
d2da5b0c
LE
4785 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4786 if (likely(bits < 64))
4787 look_ahead >>= bits;
4788 else
4789 look_ahead = 0;
b411b363
PR
4790 have -= bits;
4791
4792 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4793 if (bits < 0)
2c46407d 4794 return -EIO;
b411b363
PR
4795 look_ahead |= tmp << have;
4796 have += bits;
4797 }
4798
4799 c->bit_offset = s;
4800 bm_xfer_ctx_bit_to_word_offset(c);
4801
2c46407d 4802 return (s != c->bm_bits);
b411b363
PR
4803}
4804
2c46407d
AG
4805/**
4806 * decode_bitmap_c - dispatch a compressed bitmap chunk to its decoder
4807 *
4808 * Return 0 when done, 1 when another iteration is needed, and a negative error
4809 * code upon failure.
4810 */
4811static int
69a22773 4812decode_bitmap_c(struct drbd_peer_device *peer_device,
b411b363 4813 struct p_compressed_bm *p,
c6d25cfe
PR
4814 struct bm_xfer_ctx *c,
4815 unsigned int len)
b411b363 4816{
a02d1240 4817 if (dcbp_get_code(p) == RLE_VLI_Bits)
69a22773 4818 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
b411b363
PR
4819
4820 /* other variants had been implemented for evaluation,
4821 * but have been dropped as this one turned out to be "best"
4822 * during all our tests. */
4823
69a22773
AG
4824 drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4825 conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4826 return -EIO;
b411b363
PR
4827}
4828
b30ab791 4829void INFO_bm_xfer_stats(struct drbd_device *device,
b411b363
PR
4830 const char *direction, struct bm_xfer_ctx *c)
4831{
4832 /* what would it take to transfer it "plaintext" */
a6b32bc3 4833 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
50d0b1ad
AG
4834 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4835 unsigned int plain =
4836 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4837 c->bm_words * sizeof(unsigned long);
4838 unsigned int total = c->bytes[0] + c->bytes[1];
4839 unsigned int r;
b411b363
PR
4840
4841 /* total cannot be zero, but just in case: */
4842 if (total == 0)
4843 return;
4844
4845 /* don't report if not compressed */
4846 if (total >= plain)
4847 return;
4848
4849 /* total < plain. check for overflow, still */
4850 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4851 : (1000 * total / plain);
4852
4853 if (r > 1000)
4854 r = 1000;
4855
4856 r = 1000 - r;
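 /* r is now the saving in per-mille.  E.g. total = 300 compressed bytes
  * vs plain = 1200 bytes gives r = 1000 - 250 = 750, reported below as
  * "compression: 75.0%". */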
d0180171 4857 drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
b411b363
PR
4858 "total %u; compression: %u.%u%%\n",
4859 direction,
4860 c->bytes[1], c->packets[1],
4861 c->bytes[0], c->packets[0],
4862 total, r/10, r % 10);
4863}
4864
4865/* Since we process the bitfield from lower addresses to higher, it does
4866 not matter whether we process it in 32-bit or 64-bit chunks, as long as
4867 it is little endian. (Understand it as a byte stream, beginning with the
4868 lowest byte; e.g. bitmap bit 0 is bit 0 of byte 0 either way.) If we
4869 used big endian, we would need to process it from the highest address
4870 to the lowest in order to be agnostic to the 32 vs 64 bit issue.
4871
4872 Returns 0 on success, a negative error code on failure. */
bde89a9e 4873static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4874{
9f4fe9ad 4875 struct drbd_peer_device *peer_device;
b30ab791 4876 struct drbd_device *device;
b411b363 4877 struct bm_xfer_ctx c;
2c46407d 4878 int err;
4a76b161 4879
9f4fe9ad
AG
4880 peer_device = conn_peer_device(connection, pi->vnr);
4881 if (!peer_device)
4a76b161 4882 return -EIO;
9f4fe9ad 4883 device = peer_device->device;
b411b363 4884
b30ab791 4885 drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
20ceb2b2
LE
4886 /* you are supposed to send additional out-of-sync information
4887 * if you actually set bits during this phase */
b411b363 4888
b411b363 4889 c = (struct bm_xfer_ctx) {
b30ab791
AG
4890 .bm_bits = drbd_bm_bits(device),
4891 .bm_words = drbd_bm_words(device),
b411b363
PR
4892 };
4893
2c46407d 4894 for(;;) {
e658983a 4895 if (pi->cmd == P_BITMAP)
69a22773 4896 err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
e658983a 4897 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4898 /* MAYBE: sanity check that we speak proto >= 90,
4899 * and the feature is enabled! */
e658983a 4900 struct p_compressed_bm *p = pi->data;
b411b363 4901
bde89a9e 4902 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
d0180171 4903 drbd_err(device, "ReportCBitmap packet too large\n");
82bc0194 4904 err = -EIO;
b411b363
PR
4905 goto out;
4906 }
e658983a 4907 if (pi->size <= sizeof(*p)) {
d0180171 4908 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4909 err = -EIO;
78fcbdae 4910 goto out;
b411b363 4911 }
9f4fe9ad 4912 err = drbd_recv_all(peer_device->connection, p, pi->size);
e658983a
AG
4913 if (err)
4914 goto out;
69a22773 4915 err = decode_bitmap_c(peer_device, p, &c, pi->size);
b411b363 4916 } else {
d0180171 4917 drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 4918 err = -EIO;
b411b363
PR
4919 goto out;
4920 }
4921
e2857216 4922 c.packets[pi->cmd == P_BITMAP]++;
bde89a9e 4923 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
b411b363 4924
2c46407d
AG
4925 if (err <= 0) {
4926 if (err < 0)
4927 goto out;
b411b363 4928 break;
2c46407d 4929 }
9f4fe9ad 4930 err = drbd_recv_header(peer_device->connection, pi);
82bc0194 4931 if (err)
b411b363 4932 goto out;
2c46407d 4933 }
b411b363 4934
b30ab791 4935 INFO_bm_xfer_stats(device, "receive", &c);
b411b363 4936
b30ab791 4937 if (device->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4938 enum drbd_state_rv rv;
4939
b30ab791 4940 err = drbd_send_bitmap(device);
82bc0194 4941 if (err)
b411b363
PR
4942 goto out;
4943 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
b30ab791 4944 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
0b0ba1ef 4945 D_ASSERT(device, rv == SS_SUCCESS);
b30ab791 4946 } else if (device->state.conn != C_WF_BITMAP_S) {
b411b363
PR
4947 /* admin may have requested C_DISCONNECTING,
4948 * other threads may have noticed network errors */
d0180171 4949 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
b30ab791 4950 drbd_conn_str(device->state.conn));
b411b363 4951 }
82bc0194 4952 err = 0;
b411b363 4953
b411b363 4954 out:
b30ab791
AG
4955 drbd_bm_unlock(device);
4956 if (!err && device->state.conn == C_WF_BITMAP_S)
4957 drbd_start_resync(device, C_SYNC_SOURCE);
82bc0194 4958 return err;
b411b363
PR
4959}
4960
bde89a9e 4961static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4962{
1ec861eb 4963 drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 4964 pi->cmd, pi->size);
b411b363 4965
bde89a9e 4966 return ignore_remaining_packet(connection, pi);
b411b363
PR
4967}
4968
bde89a9e 4969static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
0ced55a3 4970{
e7f52dfb
LE
4971 /* Make sure we've acked all the TCP data associated
4972 * with the data requests being unplugged */
bde89a9e 4973 drbd_tcp_quickack(connection->data.socket);
0ced55a3 4974
82bc0194 4975 return 0;
0ced55a3
PR
4976}
4977
bde89a9e 4978static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
73a01a18 4979{
9f4fe9ad 4980 struct drbd_peer_device *peer_device;
b30ab791 4981 struct drbd_device *device;
e658983a 4982 struct p_block_desc *p = pi->data;
4a76b161 4983
9f4fe9ad
AG
4984 peer_device = conn_peer_device(connection, pi->vnr);
4985 if (!peer_device)
4a76b161 4986 return -EIO;
9f4fe9ad 4987 device = peer_device->device;
73a01a18 4988
b30ab791 4989 switch (device->state.conn) {
f735e363
LE
4990 case C_WF_SYNC_UUID:
4991 case C_WF_BITMAP_T:
4992 case C_BEHIND:
4993 break;
4994 default:
d0180171 4995 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
b30ab791 4996 drbd_conn_str(device->state.conn));
f735e363
LE
4997 }
4998
b30ab791 4999 drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
73a01a18 5000
82bc0194 5001 return 0;
73a01a18
PR
5002}
5003
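/* P_RS_DEALLOCATED (thin resync): instead of shipping zeroed data, the sync
 * source announces a deallocated range; we zero it locally with a
 * REQ_OP_WRITE_ZEROES request and acknowledge it like a regular resync
 * write, or send a P_NEG_ACK if we cannot. */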
700ca8c0
PR
5004static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
5005{
5006 struct drbd_peer_device *peer_device;
5007 struct p_block_desc *p = pi->data;
5008 struct drbd_device *device;
5009 sector_t sector;
5010 int size, err = 0;
5011
5012 peer_device = conn_peer_device(connection, pi->vnr);
5013 if (!peer_device)
5014 return -EIO;
5015 device = peer_device->device;
5016
5017 sector = be64_to_cpu(p->sector);
5018 size = be32_to_cpu(p->blksize);
5019
5020 dec_rs_pending(device);
5021
5022 if (get_ldev(device)) {
5023 struct drbd_peer_request *peer_req;
45c21793 5024 const int op = REQ_OP_WRITE_ZEROES;
700ca8c0
PR
5025
5026 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
9104d31a 5027 size, 0, GFP_NOIO);
700ca8c0
PR
5028 if (!peer_req) {
5029 put_ldev(device);
5030 return -ENOMEM;
5031 }
5032
5033 peer_req->w.cb = e_end_resync_block;
5034 peer_req->submit_jif = jiffies;
f31e583a 5035 peer_req->flags |= EE_TRIM;
700ca8c0
PR
5036
5037 spin_lock_irq(&device->resource->req_lock);
5038 list_add_tail(&peer_req->w.list, &device->sync_ee);
5039 spin_unlock_irq(&device->resource->req_lock);
5040
5041 atomic_add(pi->size >> 9, &device->rs_sect_ev);
5042 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
5043
5044 if (err) {
5045 spin_lock_irq(&device->resource->req_lock);
5046 list_del(&peer_req->w.list);
5047 spin_unlock_irq(&device->resource->req_lock);
5048
5049 drbd_free_peer_req(device, peer_req);
5050 put_ldev(device);
5051 err = 0;
5052 goto fail;
5053 }
5054
5055 inc_unacked(device);
5056
5057 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
5058 as well as drbd_rs_complete_io() */
5059 } else {
5060 fail:
5061 drbd_rs_complete_io(device, sector);
5062 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
5063 }
5064
5065 atomic_add(size >> 9, &device->rs_sect_in);
5066
5067 return err;
5068}
5069
02918be2
PR
5070struct data_cmd {
5071 int expect_payload;
9104d31a 5072 unsigned int pkt_size;
bde89a9e 5073 int (*fn)(struct drbd_connection *, struct packet_info *);
02918be2
PR
5074};
5075
5076static struct data_cmd drbd_cmd_handler[] = {
5077 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
5078 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
5079 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
5080 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
e658983a
AG
5081 [P_BITMAP] = { 1, 0, receive_bitmap } ,
5082 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
5083 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
02918be2
PR
5084 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5085 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
e658983a
AG
5086 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
5087 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
02918be2
PR
5088 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
5089 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
5090 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
5091 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
5092 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
5093 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
5094 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5095 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
5096 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
700ca8c0 5097 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
02918be2 5098 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
73a01a18 5099 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4a76b161 5100 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
036b17ea 5101 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
a0fb3c47 5102 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
f31e583a 5103 [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data },
700ca8c0 5104 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
9104d31a 5105 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
b411b363
PR
5106};
5107
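/* drbdd() looks up the received command in the table above, reads the
 * fixed-size sub-header (pkt_size) into the socket buffer, and then lets
 * the handler consume whatever payload remains according to pi.size. */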
bde89a9e 5108static void drbdd(struct drbd_connection *connection)
b411b363 5109{
77351055 5110 struct packet_info pi;
02918be2 5111 size_t shs; /* sub header size */
82bc0194 5112 int err;
b411b363 5113
bde89a9e 5114 while (get_t_state(&connection->receiver) == RUNNING) {
9104d31a 5115 struct data_cmd const *cmd;
b411b363 5116
bde89a9e 5117 drbd_thread_current_set_cpu(&connection->receiver);
c51a0ef3
LE
5118 update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
5119 if (drbd_recv_header_maybe_unplug(connection, &pi))
02918be2 5120 goto err_out;
b411b363 5121
deebe195 5122 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 5123 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
1ec861eb 5124 drbd_err(connection, "Unexpected data packet %s (0x%04x)",
2fcb8f30 5125 cmdname(pi.cmd), pi.cmd);
02918be2 5126 goto err_out;
0b33a916 5127 }
b411b363 5128
e658983a 5129 shs = cmd->pkt_size;
9104d31a
LE
5130 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
5131 shs += sizeof(struct o_qlim);
e658983a 5132 if (pi.size > shs && !cmd->expect_payload) {
1ec861eb 5133 drbd_err(connection, "No payload expected %s l:%d\n",
2fcb8f30 5134 cmdname(pi.cmd), pi.size);
02918be2 5135 goto err_out;
b411b363 5136 }
9104d31a
LE
5137 if (pi.size < shs) {
5138 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5139 cmdname(pi.cmd), (int)shs, pi.size);
5140 goto err_out;
5141 }
b411b363 5142
c13f7e1a 5143 if (shs) {
944410e9 5144 update_receiver_timing_details(connection, drbd_recv_all_warn);
bde89a9e 5145 err = drbd_recv_all_warn(connection, pi.data, shs);
a5c31904 5146 if (err)
c13f7e1a 5147 goto err_out;
e2857216 5148 pi.size -= shs;
c13f7e1a
LE
5149 }
5150
944410e9 5151 update_receiver_timing_details(connection, cmd->fn);
bde89a9e 5152 err = cmd->fn(connection, &pi);
4a76b161 5153 if (err) {
1ec861eb 5154 drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
9f5bdc33 5155 cmdname(pi.cmd), err, pi.size);
02918be2 5156 goto err_out;
b411b363
PR
5157 }
5158 }
82bc0194 5159 return;
b411b363 5160
82bc0194 5161 err_out:
bde89a9e 5162 conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
5163}
5164
bde89a9e 5165static void conn_disconnect(struct drbd_connection *connection)
b411b363 5166{
c06ece6b 5167 struct drbd_peer_device *peer_device;
bbeb641c 5168 enum drbd_conns oc;
376694a0 5169 int vnr;
b411b363 5170
bde89a9e 5171 if (connection->cstate == C_STANDALONE)
b411b363 5172 return;
b411b363 5173
545752d5
LE
5174 /* We are about to start the cleanup after connection loss.
5175 * Make sure drbd_make_request knows about that.
5176 * Usually we should be in some network failure state already,
5177 * but just in case we are not, we fix it up here.
5178 */
bde89a9e 5179 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
545752d5 5180
668700b4 5181 /* ack_receiver does not clean up anything. it must not interfere, either */
1c03e520 5182 drbd_thread_stop(&connection->ack_receiver);
668700b4
PR
5183 if (connection->ack_sender) {
5184 destroy_workqueue(connection->ack_sender);
5185 connection->ack_sender = NULL;
5186 }
bde89a9e 5187 drbd_free_sock(connection);
360cc740 5188
c141ebda 5189 rcu_read_lock();
c06ece6b
AG
5190 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5191 struct drbd_device *device = peer_device->device;
b30ab791 5192 kref_get(&device->kref);
c141ebda 5193 rcu_read_unlock();
69a22773 5194 drbd_disconnected(peer_device);
c06ece6b 5195 kref_put(&device->kref, drbd_destroy_device);
c141ebda
PR
5196 rcu_read_lock();
5197 }
5198 rcu_read_unlock();
5199
bde89a9e 5200 if (!list_empty(&connection->current_epoch->list))
1ec861eb 5201 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
12038a3a 5202 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
bde89a9e
AG
5203 atomic_set(&connection->current_epoch->epoch_size, 0);
5204 connection->send.seen_any_write_yet = false;
12038a3a 5205
1ec861eb 5206 drbd_info(connection, "Connection closed\n");
360cc740 5207
bde89a9e
AG
5208 if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
5209 conn_try_outdate_peer_async(connection);
cb703454 5210
0500813f 5211 spin_lock_irq(&connection->resource->req_lock);
bde89a9e 5212 oc = connection->cstate;
bbeb641c 5213 if (oc >= C_UNCONNECTED)
bde89a9e 5214 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
bbeb641c 5215
0500813f 5216 spin_unlock_irq(&connection->resource->req_lock);
360cc740 5217
f3dfa40a 5218 if (oc == C_DISCONNECTING)
bde89a9e 5219 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
360cc740
PR
5220}
5221
69a22773 5222static int drbd_disconnected(struct drbd_peer_device *peer_device)
360cc740 5223{
69a22773 5224 struct drbd_device *device = peer_device->device;
360cc740 5225 unsigned int i;
b411b363 5226
85719573 5227 /* wait for current activity to cease. */
0500813f 5228 spin_lock_irq(&device->resource->req_lock);
b30ab791
AG
5229 _drbd_wait_ee_list_empty(device, &device->active_ee);
5230 _drbd_wait_ee_list_empty(device, &device->sync_ee);
5231 _drbd_wait_ee_list_empty(device, &device->read_ee);
0500813f 5232 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
5233
5234 /* We do not have data structures that would allow us to
5235 * get the rs_pending_cnt down to 0 again.
5236 * * On C_SYNC_TARGET we do not have any data structures describing
5237 * the pending RSDataRequest's we have sent.
5238 * * On C_SYNC_SOURCE there is no data structure that tracks
5239 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5240 * And no, it is not the sum of the reference counts in the
5241 * resync_LRU. The resync_LRU tracks the whole operation including
5242 * the disk-IO, while the rs_pending_cnt only tracks the blocks
5243 * on the fly. */
b30ab791
AG
5244 drbd_rs_cancel_all(device);
5245 device->rs_total = 0;
5246 device->rs_failed = 0;
5247 atomic_set(&device->rs_pending_cnt, 0);
5248 wake_up(&device->misc_wait);
b411b363 5249
b30ab791 5250 del_timer_sync(&device->resync_timer);
2bccef39 5251 resync_timer_fn(&device->resync_timer);
b411b363 5252
b411b363
PR
5253 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5254 * w_make_resync_request etc. which may still be on the worker queue
5255 * to be "canceled" */
b5043c5e 5256 drbd_flush_workqueue(&peer_device->connection->sender_work);
b411b363 5257
b30ab791 5258 drbd_finish_peer_reqs(device);
b411b363 5259
d10b4ea3
PR
5260 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5261 might have queued more work. The flush before drbd_finish_peer_reqs() is
5262 necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
b5043c5e 5263 drbd_flush_workqueue(&peer_device->connection->sender_work);
d10b4ea3 5264
08332d73
LE
5265 /* need to do it again, drbd_finish_peer_reqs() may have populated it
5266 * again via drbd_try_clear_on_disk_bm(). */
b30ab791 5267 drbd_rs_cancel_all(device);
b411b363 5268
b30ab791
AG
5269 kfree(device->p_uuid);
5270 device->p_uuid = NULL;
b411b363 5271
b30ab791 5272 if (!drbd_suspended(device))
69a22773 5273 tl_clear(peer_device->connection);
b411b363 5274
b30ab791 5275 drbd_md_sync(device);
b411b363 5276
be115b69
LE
5277 if (get_ldev(device)) {
5278 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5279 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5280 put_ldev(device);
5281 }
20ceb2b2 5282
b411b363
PR
5283 /* tcp_close and release of sendpage pages can be deferred. I don't
5284 * want to use SO_LINGER, because apparently it can be deferred for
5285 * more than 20 seconds (longest time I checked).
5286 *
5287 * Actually we don't care for exactly when the network stack does its
5288 * put_page(), but release our reference on these pages right here.
5289 */
b30ab791 5290 i = drbd_free_peer_reqs(device, &device->net_ee);
b411b363 5291 if (i)
d0180171 5292 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
b30ab791 5293 i = atomic_read(&device->pp_in_use_by_net);
435f0740 5294 if (i)
d0180171 5295 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
b30ab791 5296 i = atomic_read(&device->pp_in_use);
b411b363 5297 if (i)
d0180171 5298 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
b411b363 5299
0b0ba1ef
AG
5300 D_ASSERT(device, list_empty(&device->read_ee));
5301 D_ASSERT(device, list_empty(&device->active_ee));
5302 D_ASSERT(device, list_empty(&device->sync_ee));
5303 D_ASSERT(device, list_empty(&device->done_ee));
b411b363 5304
360cc740 5305 return 0;
b411b363
PR
5306}
5307
5308/*
5309 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5310 * we can agree on is stored in agreed_pro_version.
5311 *
5312 * feature flags and the reserved array should be enough room for future
5313 * enhancements of the handshake protocol, and possible plugins...
5314 *
5315 * for now, they are expected to be zero, but ignored.
5316 */
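/* In drbd_do_features() below the outcome is, roughly:
 *   agreed_pro_version = min(PRO_VERSION_MAX, peer->protocol_max);
 *   agreed_features    = PRO_FEATURES & peer->feature_flags;
 * so both sides end up with the highest common protocol version and the
 * intersection of the advertised feature bits. */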
bde89a9e 5317static int drbd_send_features(struct drbd_connection *connection)
b411b363 5318{
9f5bdc33
AG
5319 struct drbd_socket *sock;
5320 struct p_connection_features *p;
b411b363 5321
bde89a9e
AG
5322 sock = &connection->data;
5323 p = conn_prepare_command(connection, sock);
9f5bdc33 5324 if (!p)
e8d17b01 5325 return -EIO;
b411b363
PR
5326 memset(p, 0, sizeof(*p));
5327 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5328 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
20c68fde 5329 p->feature_flags = cpu_to_be32(PRO_FEATURES);
bde89a9e 5330 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
b411b363
PR
5331}
5332
5333/*
5334 * return values:
5335 * 1 yes, we have a valid connection
5336 * 0 oops, did not work out, please try again
5337 * -1 peer talks different language,
5338 * no point in trying again, please go standalone.
5339 */
bde89a9e 5340static int drbd_do_features(struct drbd_connection *connection)
b411b363 5341{
bde89a9e 5342 /* ASSERT current == connection->receiver ... */
e658983a
AG
5343 struct p_connection_features *p;
5344 const int expect = sizeof(struct p_connection_features);
77351055 5345 struct packet_info pi;
a5c31904 5346 int err;
b411b363 5347
bde89a9e 5348 err = drbd_send_features(connection);
e8d17b01 5349 if (err)
b411b363
PR
5350 return 0;
5351
bde89a9e 5352 err = drbd_recv_header(connection, &pi);
69bc7bc3 5353 if (err)
b411b363
PR
5354 return 0;
5355
6038178e 5356 if (pi.cmd != P_CONNECTION_FEATURES) {
1ec861eb 5357 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
2fcb8f30 5358 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5359 return -1;
5360 }
5361
77351055 5362 if (pi.size != expect) {
1ec861eb 5363 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 5364 expect, pi.size);
b411b363
PR
5365 return -1;
5366 }
5367
e658983a 5368 p = pi.data;
bde89a9e 5369 err = drbd_recv_all_warn(connection, p, expect);
a5c31904 5370 if (err)
b411b363 5371 return 0;
b411b363 5372
b411b363
PR
5373 p->protocol_min = be32_to_cpu(p->protocol_min);
5374 p->protocol_max = be32_to_cpu(p->protocol_max);
5375 if (p->protocol_max == 0)
5376 p->protocol_max = p->protocol_min;
5377
5378 if (PRO_VERSION_MAX < p->protocol_min ||
5379 PRO_VERSION_MIN > p->protocol_max)
5380 goto incompat;
5381
bde89a9e 5382 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
20c68fde 5383 connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
b411b363 5384
1ec861eb 5385 drbd_info(connection, "Handshake successful: "
bde89a9e 5386 "Agreed network protocol version %d\n", connection->agreed_pro_version);
b411b363 5387
f31e583a 5388 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
9104d31a
LE
5389 connection->agreed_features,
5390 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5391 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
f31e583a
LE
5392 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
5393 connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
9104d31a 5394 connection->agreed_features ? "" : " none");
92d94ae6 5395
b411b363
PR
5396 return 1;
5397
5398 incompat:
1ec861eb 5399 drbd_err(connection, "incompatible DRBD dialects: "
b411b363
PR
5400 "I support %d-%d, peer supports %d-%d\n",
5401 PRO_VERSION_MIN, PRO_VERSION_MAX,
5402 p->protocol_min, p->protocol_max);
5403 return -1;
5404}
5405
5406#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
bde89a9e 5407static int drbd_do_auth(struct drbd_connection *connection)
b411b363 5408{
1ec861eb
AG
5409 drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
5410 drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 5411 return -1;
b411b363
PR
5412}
5413#else
5414#define CHALLENGE_LEN 64
b10d96cb
JT
5415
5416/* Return value:
5417 1 - auth succeeded,
5418 0 - failed, try again (network error),
5419 -1 - auth failed, don't try again.
5420*/
5421
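/* Challenge/response outline (CRAM-HMAC keyed with the shared secret):
 *   1. send our random challenge (P_AUTH_CHALLENGE)
 *   2. receive the peer's challenge; reject it if it equals our own
 *   3. reply with HMAC(secret, peer's challenge) (P_AUTH_RESPONSE)
 *   4. receive the peer's response and compare it against
 *      HMAC(secret, our challenge) computed locally. */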
bde89a9e 5422static int drbd_do_auth(struct drbd_connection *connection)
b411b363 5423{
9f5bdc33 5424 struct drbd_socket *sock;
b411b363 5425 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
b411b363
PR
5426 char *response = NULL;
5427 char *right_response = NULL;
5428 char *peers_ch = NULL;
44ed167d
PR
5429 unsigned int key_len;
5430 char secret[SHARED_SECRET_MAX]; /* 64 byte */
b411b363 5431 unsigned int resp_size;
9534d671 5432 SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm);
77351055 5433 struct packet_info pi;
44ed167d 5434 struct net_conf *nc;
69bc7bc3 5435 int err, rv;
b411b363 5436
9f5bdc33 5437 /* FIXME: Put the challenge/response into the preallocated socket buffer. */
b411b363 5438
44ed167d 5439 rcu_read_lock();
bde89a9e 5440 nc = rcu_dereference(connection->net_conf);
44ed167d
PR
5441 key_len = strlen(nc->shared_secret);
5442 memcpy(secret, nc->shared_secret, key_len);
5443 rcu_read_unlock();
5444
9534d671
HX
5445 desc->tfm = connection->cram_hmac_tfm;
5446 desc->flags = 0;
b411b363 5447
9534d671 5448 rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
b411b363 5449 if (rv) {
9534d671 5450 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
b10d96cb 5451 rv = -1;
b411b363
PR
5452 goto fail;
5453 }
5454
5455 get_random_bytes(my_challenge, CHALLENGE_LEN);
5456
bde89a9e
AG
5457 sock = &connection->data;
5458 if (!conn_prepare_command(connection, sock)) {
9f5bdc33
AG
5459 rv = 0;
5460 goto fail;
5461 }
bde89a9e 5462 rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
9f5bdc33 5463 my_challenge, CHALLENGE_LEN);
b411b363
PR
5464 if (!rv)
5465 goto fail;
5466
bde89a9e 5467 err = drbd_recv_header(connection, &pi);
69bc7bc3
AG
5468 if (err) {
5469 rv = 0;
b411b363 5470 goto fail;
69bc7bc3 5471 }
b411b363 5472
77351055 5473 if (pi.cmd != P_AUTH_CHALLENGE) {
1ec861eb 5474 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
2fcb8f30 5475 cmdname(pi.cmd), pi.cmd);
9049ccd4 5476 rv = -1;
b411b363
PR
5477 goto fail;
5478 }
5479
77351055 5480 if (pi.size > CHALLENGE_LEN * 2) {
1ec861eb 5481 drbd_err(connection, "AuthChallenge payload too big.\n");
b10d96cb 5482 rv = -1;
b411b363
PR
5483 goto fail;
5484 }
5485
67cca286
PR
5486 if (pi.size < CHALLENGE_LEN) {
5487 drbd_err(connection, "AuthChallenge payload too small.\n");
5488 rv = -1;
5489 goto fail;
5490 }
5491
77351055 5492 peers_ch = kmalloc(pi.size, GFP_NOIO);
b411b363 5493 if (peers_ch == NULL) {
1ec861eb 5494 drbd_err(connection, "kmalloc of peers_ch failed\n");
b10d96cb 5495 rv = -1;
b411b363
PR
5496 goto fail;
5497 }
5498
bde89a9e 5499 err = drbd_recv_all_warn(connection, peers_ch, pi.size);
a5c31904 5500 if (err) {
b411b363
PR
5501 rv = 0;
5502 goto fail;
5503 }
5504
67cca286
PR
5505 if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
5506 drbd_err(connection, "Peer presented the same challenge!\n");
5507 rv = -1;
5508 goto fail;
5509 }
5510
9534d671 5511 resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
b411b363
PR
5512 response = kmalloc(resp_size, GFP_NOIO);
5513 if (response == NULL) {
1ec861eb 5514 drbd_err(connection, "kmalloc of response failed\n");
b10d96cb 5515 rv = -1;
b411b363
PR
5516 goto fail;
5517 }
5518
9534d671 5519 rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
b411b363 5520 if (rv) {
1ec861eb 5521 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 5522 rv = -1;
b411b363
PR
5523 goto fail;
5524 }
5525
bde89a9e 5526 if (!conn_prepare_command(connection, sock)) {
9f5bdc33 5527 rv = 0;
b411b363 5528 goto fail;
9f5bdc33 5529 }
bde89a9e 5530 rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
9f5bdc33 5531 response, resp_size);
b411b363
PR
5532 if (!rv)
5533 goto fail;
5534
bde89a9e 5535 err = drbd_recv_header(connection, &pi);
69bc7bc3 5536 if (err) {
b411b363
PR
5537 rv = 0;
5538 goto fail;
5539 }
5540
77351055 5541 if (pi.cmd != P_AUTH_RESPONSE) {
1ec861eb 5542 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
2fcb8f30 5543 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5544 rv = 0;
5545 goto fail;
5546 }
5547
77351055 5548 if (pi.size != resp_size) {
1ec861eb 5549 drbd_err(connection, "AuthResponse payload of unexpected size\n");
b411b363
PR
5550 rv = 0;
5551 goto fail;
5552 }
b411b363 5553
bde89a9e 5554 err = drbd_recv_all_warn(connection, response , resp_size);
a5c31904 5555 if (err) {
b411b363
PR
5556 rv = 0;
5557 goto fail;
5558 }
5559
5560 right_response = kmalloc(resp_size, GFP_NOIO);
2d1ee87d 5561 if (right_response == NULL) {
1ec861eb 5562 drbd_err(connection, "kmalloc of right_response failed\n");
b10d96cb 5563 rv = -1;
b411b363
PR
5564 goto fail;
5565 }
5566
9534d671
HX
5567 rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
5568 right_response);
b411b363 5569 if (rv) {
1ec861eb 5570 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 5571 rv = -1;
b411b363
PR
5572 goto fail;
5573 }
5574
5575 rv = !memcmp(response, right_response, resp_size);
5576
5577 if (rv)
1ec861eb 5578 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
44ed167d 5579 resp_size);
b10d96cb
JT
5580 else
5581 rv = -1;
b411b363
PR
5582
5583 fail:
5584 kfree(peers_ch);
5585 kfree(response);
5586 kfree(right_response);
9534d671 5587 shash_desc_zero(desc);
b411b363
PR
5588
5589 return rv;
5590}
5591#endif
5592
8fe60551 5593int drbd_receiver(struct drbd_thread *thi)
b411b363 5594{
bde89a9e 5595 struct drbd_connection *connection = thi->connection;
b411b363
PR
5596 int h;
5597
1ec861eb 5598 drbd_info(connection, "receiver (re)started\n");
b411b363
PR
5599
5600 do {
bde89a9e 5601 h = conn_connect(connection);
b411b363 5602 if (h == 0) {
bde89a9e 5603 conn_disconnect(connection);
20ee6390 5604 schedule_timeout_interruptible(HZ);
b411b363
PR
5605 }
5606 if (h == -1) {
1ec861eb 5607 drbd_warn(connection, "Discarding network configuration.\n");
bde89a9e 5608 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
5609 }
5610 } while (h == 0);
5611
c51a0ef3
LE
5612 if (h > 0) {
5613 blk_start_plug(&connection->receiver_plug);
bde89a9e 5614 drbdd(connection);
c51a0ef3
LE
5615 blk_finish_plug(&connection->receiver_plug);
5616 }
b411b363 5617
bde89a9e 5618 conn_disconnect(connection);
b411b363 5619
1ec861eb 5620 drbd_info(connection, "receiver terminated\n");
b411b363
PR
5621 return 0;
5622}
5623
5624/* ********* acknowledge sender ******** */
5625
bde89a9e 5626static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5627{
e658983a 5628 struct p_req_state_reply *p = pi->data;
e4f78ede
PR
5629 int retcode = be32_to_cpu(p->retcode);
5630
5631 if (retcode >= SS_SUCCESS) {
bde89a9e 5632 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
e4f78ede 5633 } else {
bde89a9e 5634 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
1ec861eb 5635 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
e4f78ede
PR
5636 drbd_set_st_err_str(retcode), retcode);
5637 }
bde89a9e 5638 wake_up(&connection->ping_wait);
e4f78ede 5639
2735a594 5640 return 0;
e4f78ede 5641}
b411b363 5642
bde89a9e 5643static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5644{
9f4fe9ad 5645 struct drbd_peer_device *peer_device;
b30ab791 5646 struct drbd_device *device;
e658983a 5647 struct p_req_state_reply *p = pi->data;
b411b363
PR
5648 int retcode = be32_to_cpu(p->retcode);
5649
9f4fe9ad
AG
5650 peer_device = conn_peer_device(connection, pi->vnr);
5651 if (!peer_device)
2735a594 5652 return -EIO;
9f4fe9ad 5653 device = peer_device->device;
1952e916 5654
bde89a9e 5655 if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
0b0ba1ef 5656 D_ASSERT(device, connection->agreed_pro_version < 100);
bde89a9e 5657 return got_conn_RqSReply(connection, pi);
4d0fc3fd
PR
5658 }
5659
b411b363 5660 if (retcode >= SS_SUCCESS) {
b30ab791 5661 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
b411b363 5662 } else {
b30ab791 5663 set_bit(CL_ST_CHG_FAIL, &device->flags);
d0180171 5664 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
e4f78ede 5665 drbd_set_st_err_str(retcode), retcode);
b411b363 5666 }
b30ab791 5667 wake_up(&device->state_wait);
b411b363 5668
2735a594 5669 return 0;
b411b363
PR
5670}
5671
bde89a9e 5672static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5673{
bde89a9e 5674 return drbd_send_ping_ack(connection);
b411b363
PR
5675
5676}
5677
bde89a9e 5678static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363
PR
5679{
5680 /* restore idle timeout */
bde89a9e
AG
5681 connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5682 if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5683 wake_up(&connection->ping_wait);
b411b363 5684
2735a594 5685 return 0;
b411b363
PR
5686}
5687
bde89a9e 5688static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5689{
9f4fe9ad 5690 struct drbd_peer_device *peer_device;
b30ab791 5691 struct drbd_device *device;
e658983a 5692 struct p_block_ack *p = pi->data;
b411b363
PR
5693 sector_t sector = be64_to_cpu(p->sector);
5694 int blksize = be32_to_cpu(p->blksize);
5695
9f4fe9ad
AG
5696 peer_device = conn_peer_device(connection, pi->vnr);
5697 if (!peer_device)
2735a594 5698 return -EIO;
9f4fe9ad 5699 device = peer_device->device;
1952e916 5700
9f4fe9ad 5701 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
b411b363 5702
69a22773 5703 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5704
b30ab791
AG
5705 if (get_ldev(device)) {
5706 drbd_rs_complete_io(device, sector);
5707 drbd_set_in_sync(device, sector, blksize);
1d53f09e 5708 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
b30ab791
AG
5709 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5710 put_ldev(device);
1d53f09e 5711 }
b30ab791
AG
5712 dec_rs_pending(device);
5713 atomic_add(blksize >> 9, &device->rs_sect_in);
b411b363 5714
2735a594 5715 return 0;
b411b363
PR
5716}
5717
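/* Look up the request the peer's ack refers to (by block_id and sector) in
 * the given rb tree and feed the corresponding event into the request state
 * machine; complete the master bio if that transition finishes it. */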
bc9c5c41 5718static int
b30ab791 5719validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
bc9c5c41
AG
5720 struct rb_root *root, const char *func,
5721 enum drbd_req_event what, bool missing_ok)
b411b363
PR
5722{
5723 struct drbd_request *req;
5724 struct bio_and_error m;
5725
0500813f 5726 spin_lock_irq(&device->resource->req_lock);
b30ab791 5727 req = find_request(device, root, id, sector, missing_ok, func);
b411b363 5728 if (unlikely(!req)) {
0500813f 5729 spin_unlock_irq(&device->resource->req_lock);
85997675 5730 return -EIO;
b411b363
PR
5731 }
5732 __req_mod(req, what, &m);
0500813f 5733 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
5734
5735 if (m.bio)
b30ab791 5736 complete_master_bio(device, &m);
85997675 5737 return 0;
b411b363
PR
5738}
5739
bde89a9e 5740static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5741{
9f4fe9ad 5742 struct drbd_peer_device *peer_device;
b30ab791 5743 struct drbd_device *device;
e658983a 5744 struct p_block_ack *p = pi->data;
b411b363
PR
5745 sector_t sector = be64_to_cpu(p->sector);
5746 int blksize = be32_to_cpu(p->blksize);
5747 enum drbd_req_event what;
5748
9f4fe9ad
AG
5749 peer_device = conn_peer_device(connection, pi->vnr);
5750 if (!peer_device)
2735a594 5751 return -EIO;
9f4fe9ad 5752 device = peer_device->device;
1952e916 5753
69a22773 5754 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5755
579b57ed 5756 if (p->block_id == ID_SYNCER) {
b30ab791
AG
5757 drbd_set_in_sync(device, sector, blksize);
5758 dec_rs_pending(device);
2735a594 5759 return 0;
b411b363 5760 }
e05e1e59 5761 switch (pi->cmd) {
b411b363 5762 case P_RS_WRITE_ACK:
8554df1c 5763 what = WRITE_ACKED_BY_PEER_AND_SIS;
b411b363
PR
5764 break;
5765 case P_WRITE_ACK:
8554df1c 5766 what = WRITE_ACKED_BY_PEER;
b411b363
PR
5767 break;
5768 case P_RECV_ACK:
8554df1c 5769 what = RECV_ACKED_BY_PEER;
b411b363 5770 break;
d4dabbe2
LE
5771 case P_SUPERSEDED:
5772 what = CONFLICT_RESOLVED;
b411b363 5773 break;
7be8da07 5774 case P_RETRY_WRITE:
7be8da07 5775 what = POSTPONE_WRITE;
b411b363
PR
5776 break;
5777 default:
2735a594 5778 BUG();
b411b363
PR
5779 }
5780
b30ab791
AG
5781 return validate_req_change_req_state(device, p->block_id, sector,
5782 &device->write_requests, __func__,
2735a594 5783 what, false);
b411b363
PR
5784}
5785
bde89a9e 5786static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5787{
9f4fe9ad 5788 struct drbd_peer_device *peer_device;
b30ab791 5789 struct drbd_device *device;
e658983a 5790 struct p_block_ack *p = pi->data;
b411b363 5791 sector_t sector = be64_to_cpu(p->sector);
2deb8336 5792 int size = be32_to_cpu(p->blksize);
85997675 5793 int err;
b411b363 5794
9f4fe9ad
AG
5795 peer_device = conn_peer_device(connection, pi->vnr);
5796 if (!peer_device)
2735a594 5797 return -EIO;
9f4fe9ad 5798 device = peer_device->device;
b411b363 5799
69a22773 5800 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5801
579b57ed 5802 if (p->block_id == ID_SYNCER) {
b30ab791
AG
5803 dec_rs_pending(device);
5804 drbd_rs_failed_io(device, sector, size);
2735a594 5805 return 0;
b411b363 5806 }
2deb8336 5807
b30ab791
AG
5808 err = validate_req_change_req_state(device, p->block_id, sector,
5809 &device->write_requests, __func__,
303d1448 5810 NEG_ACKED, true);
85997675 5811 if (err) {
c3afd8f5
AG
5812 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5813 The master bio might already be completed, therefore the
5814 request is no longer in the collision hash. */
5815 /* In Protocol B we might already have got a P_RECV_ACK
5816 but then get a P_NEG_ACK afterwards. */
b30ab791 5817 drbd_set_out_of_sync(device, sector, size);
2deb8336 5818 }
2735a594 5819 return 0;
b411b363
PR
5820}
5821
bde89a9e 5822static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5823{
9f4fe9ad 5824 struct drbd_peer_device *peer_device;
b30ab791 5825 struct drbd_device *device;
e658983a 5826 struct p_block_ack *p = pi->data;
b411b363
PR
5827 sector_t sector = be64_to_cpu(p->sector);
5828
9f4fe9ad
AG
5829 peer_device = conn_peer_device(connection, pi->vnr);
5830 if (!peer_device)
2735a594 5831 return -EIO;
9f4fe9ad 5832 device = peer_device->device;
1952e916 5833
69a22773 5834 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
7be8da07 5835
d0180171 5836 drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
b411b363
PR
5837 (unsigned long long)sector, be32_to_cpu(p->blksize));
5838
b30ab791
AG
5839 return validate_req_change_req_state(device, p->block_id, sector,
5840 &device->read_requests, __func__,
2735a594 5841 NEG_ACKED, false);
b411b363
PR
5842}
5843
bde89a9e 5844static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5845{
9f4fe9ad 5846 struct drbd_peer_device *peer_device;
b30ab791 5847 struct drbd_device *device;
b411b363
PR
5848 sector_t sector;
5849 int size;
e658983a 5850 struct p_block_ack *p = pi->data;
1952e916 5851
9f4fe9ad
AG
5852 peer_device = conn_peer_device(connection, pi->vnr);
5853 if (!peer_device)
2735a594 5854 return -EIO;
9f4fe9ad 5855 device = peer_device->device;
b411b363
PR
5856
5857 sector = be64_to_cpu(p->sector);
5858 size = be32_to_cpu(p->blksize);
b411b363 5859
69a22773 5860 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363 5861
b30ab791 5862 dec_rs_pending(device);
b411b363 5863
b30ab791
AG
5864 if (get_ldev_if_state(device, D_FAILED)) {
5865 drbd_rs_complete_io(device, sector);
e05e1e59 5866 switch (pi->cmd) {
d612d309 5867 case P_NEG_RS_DREPLY:
b30ab791 5868 drbd_rs_failed_io(device, sector, size);
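 /* fall through */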
d612d309
PR
5869 case P_RS_CANCEL:
5870 break;
5871 default:
2735a594 5872 BUG();
d612d309 5873 }
b30ab791 5874 put_ldev(device);
b411b363
PR
5875 }
5876
2735a594 5877 return 0;
b411b363
PR
5878}
5879
bde89a9e 5880static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5881{
e658983a 5882 struct p_barrier_ack *p = pi->data;
c06ece6b 5883 struct drbd_peer_device *peer_device;
9ed57dcb 5884 int vnr;
1952e916 5885
bde89a9e 5886 tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
b411b363 5887
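 /* The barrier ack also tells devices that went Ahead (congestion mode)
  * that their writes have drained; once nothing is in flight any more,
  * arm the timer that switches them back to SyncSource. */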
9ed57dcb 5888 rcu_read_lock();
c06ece6b
AG
5889 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5890 struct drbd_device *device = peer_device->device;
5891
b30ab791
AG
5892 if (device->state.conn == C_AHEAD &&
5893 atomic_read(&device->ap_in_flight) == 0 &&
5894 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5895 device->start_resync_timer.expires = jiffies + HZ;
5896 add_timer(&device->start_resync_timer);
9ed57dcb 5897 }
c4752ef1 5898 }
9ed57dcb 5899 rcu_read_unlock();
c4752ef1 5900
2735a594 5901 return 0;
b411b363
PR
5902}
5903
bde89a9e 5904static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5905{
9f4fe9ad 5906 struct drbd_peer_device *peer_device;
b30ab791 5907 struct drbd_device *device;
e658983a 5908 struct p_block_ack *p = pi->data;
84b8c06b 5909 struct drbd_device_work *dw;
b411b363
PR
5910 sector_t sector;
5911 int size;
5912
9f4fe9ad
AG
5913 peer_device = conn_peer_device(connection, pi->vnr);
5914 if (!peer_device)
2735a594 5915 return -EIO;
9f4fe9ad 5916 device = peer_device->device;
1952e916 5917
b411b363
PR
5918 sector = be64_to_cpu(p->sector);
5919 size = be32_to_cpu(p->blksize);
5920
69a22773 5921 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
b411b363
PR
5922
5923 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
b30ab791 5924 drbd_ov_out_of_sync_found(device, sector, size);
b411b363 5925 else
b30ab791 5926 ov_out_of_sync_print(device);
b411b363 5927
b30ab791 5928 if (!get_ldev(device))
2735a594 5929 return 0;
1d53f09e 5930
b30ab791
AG
5931 drbd_rs_complete_io(device, sector);
5932 dec_rs_pending(device);
b411b363 5933
b30ab791 5934 --device->ov_left;
ea5442af
LE
5935
5936 /* let's advance progress step marks only for every other megabyte */
b30ab791
AG
5937 if ((device->ov_left & 0x200) == 0x200)
5938 drbd_advance_rs_marks(device, device->ov_left);
ea5442af 5939
b30ab791 5940 if (device->ov_left == 0) {
84b8c06b
AG
5941 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5942 if (dw) {
5943 dw->w.cb = w_ov_finished;
5944 dw->device = device;
5945 drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
b411b363 5946 } else {
84b8c06b 5947 drbd_err(device, "kmalloc(dw) failed.");
b30ab791
AG
5948 ov_out_of_sync_print(device);
5949 drbd_resync_finished(device);
b411b363
PR
5950 }
5951 }
b30ab791 5952 put_ldev(device);
2735a594 5953 return 0;
b411b363
PR
5954}
5955
bde89a9e 5956static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
0ced55a3 5957{
2735a594 5958 return 0;
b411b363
PR
5959}
5960
668700b4
PR
5961struct meta_sock_cmd {
5962 size_t pkt_size;
5963 int (*fn)(struct drbd_connection *connection, struct packet_info *);
5964};
5965
5966static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
0ced55a3 5967{
668700b4
PR
5968 long t;
5969 struct net_conf *nc;
32862ec7 5970
668700b4
PR
5971 rcu_read_lock();
5972 nc = rcu_dereference(connection->net_conf);
5973 t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5974 rcu_read_unlock();
c141ebda 5975
668700b4
PR
5976 t *= HZ;
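 /* ping_timeo is configured in tenths of a second, ping_int in whole
  * seconds, hence the extra division by 10 below. */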
5977 if (ping_timeout)
5978 t /= 10;
082a3439 5979
668700b4
PR
5980 connection->meta.socket->sk->sk_rcvtimeo = t;
5981}
32862ec7 5982
668700b4
PR
5983static void set_ping_timeout(struct drbd_connection *connection)
5984{
5985 set_rcvtimeo(connection, 1);
0ced55a3
PR
5986}
5987
668700b4
PR
5988static void set_idle_timeout(struct drbd_connection *connection)
5989{
5990 set_rcvtimeo(connection, 0);
5991}
b411b363 5992
668700b4 5993static struct meta_sock_cmd ack_receiver_tbl[] = {
e658983a
AG
5994 [P_PING] = { 0, got_Ping },
5995 [P_PING_ACK] = { 0, got_PingAck },
b411b363
PR
5996 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5997 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5998 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5999 [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
6000 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
6001 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
6002 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
6003 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
6004 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
6005 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
6006 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
6007 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
6008 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
6009 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
6010 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
6011};
6012
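/* Added commentary: main loop of the ack receiver thread. It runs at
 * realtime priority (SCHED_RR), sends a ping when the SEND_PING flag is set,
 * then reads from the meta socket until a full header plus payload has
 * arrived and dispatches it via ack_receiver_tbl. A receive timeout either
 * triggers a new ping (idle) or is treated as a lost peer (ping pending). */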
6013int drbd_ack_receiver(struct drbd_thread *thi)
6014{
6015 struct drbd_connection *connection = thi->connection;
6016 struct meta_sock_cmd *cmd = NULL;
6017 struct packet_info pi;
6018 unsigned long pre_recv_jif;
6019 int rv;
6020 void *buf = connection->meta.rbuf;
6021 int received = 0;
6022 unsigned int header_size = drbd_header_size(connection);
6023 int expect = header_size;
6024 bool ping_timeout_active = false;
6025 struct sched_param param = { .sched_priority = 2 };
6026
6027 rv = sched_setscheduler(current, SCHED_RR, &param);
6028 if (rv < 0)
6029 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
6030
6031 while (get_t_state(thi) == RUNNING) {
6032 drbd_thread_current_set_cpu(thi);
6033
6034 conn_reclaim_net_peer_reqs(connection);
6035
6036 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
6037 if (drbd_send_ping(connection)) {
6038 drbd_err(connection, "drbd_send_ping has failed\n");
6039 goto reconnect;
6040 }
6041 set_ping_timeout(connection);
6042 ping_timeout_active = true;
6043 }
6044
6045 pre_recv_jif = jiffies;
6046 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
6047
6048 /* Note:
6049 * -EINTR (on meta) we got a signal
6050 * -EAGAIN (on meta) rcvtimeo expired
6051 * -ECONNRESET other side closed the connection
6052 * -ERESTARTSYS (on data) we got a signal
6053 * rv < 0 other than above: unexpected error!
6054 * rv == expected: full header or command
6055 * rv < expected: "woken" by signal during receive
6056 * rv == 0 : "connection shut down by peer"
6057 */
6058 if (likely(rv > 0)) {
6059 received += rv;
6060 buf += rv;
6061 } else if (rv == 0) {
6062 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
6063 long t;
6064 rcu_read_lock();
6065 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
6066 rcu_read_unlock();
6067
6068 t = wait_event_timeout(connection->ping_wait,
6069 connection->cstate < C_WF_REPORT_PARAMS,
6070 t);
6071 if (t)
6072 break;
6073 }
6074 drbd_err(connection, "meta connection shut down by peer.\n");
6075 goto reconnect;
6076 } else if (rv == -EAGAIN) {
6077 /* If the data socket received something meanwhile,
6078 * that is good enough: peer is still alive. */
6079 if (time_after(connection->last_received, pre_recv_jif))
6080 continue;
6081 if (ping_timeout_active) {
6082 drbd_err(connection, "PingAck did not arrive in time.\n");
6083 goto reconnect;
6084 }
6085 set_bit(SEND_PING, &connection->flags);
6086 continue;
6087 } else if (rv == -EINTR) {
6088 /* maybe drbd_thread_stop(): the while condition will notice.
6089 * maybe woken for send_ping: we'll send a ping above,
6090 * and change the rcvtimeo */
6091 flush_signals(current);
6092 continue;
6093 } else {
6094 drbd_err(connection, "sock_recvmsg returned %d\n", rv);
6095 goto reconnect;
6096 }
6097
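/* Added commentary: phase 1 of the receive state machine. A complete header
 * has arrived; decode it, look up the handler in ack_receiver_tbl and learn
 * how many payload bytes to expect next. */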
6098 if (received == expect && cmd == NULL) {
6099 if (decode_header(connection, connection->meta.rbuf, &pi))
6100 goto reconnect;
6101 cmd = &ack_receiver_tbl[pi.cmd];
6102 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
6103 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
6104 cmdname(pi.cmd), pi.cmd);
6105 goto disconnect;
6106 }
6107 expect = header_size + cmd->pkt_size;
6108 if (pi.size != expect - header_size) {
6109 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
6110 pi.cmd, pi.size);
6111 goto reconnect;
6112 }
6113 }
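/* Added commentary: phase 2. Header plus payload are complete; run the
 * handler, note the time of the last received packet, and reset the
 * counters for the next one. */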
6114 if (received == expect) {
6115 bool err;
6116
6117 err = cmd->fn(connection, &pi);
6118 if (err) {
6119 drbd_err(connection, "%pf failed\n", cmd->fn);
6120 goto reconnect;
6121 }
6122
6123 connection->last_received = jiffies;
6124
6125 if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
6126 set_idle_timeout(connection);
6127 ping_timeout_active = false;
6128 }
6129
6130 buf = connection->meta.rbuf;
6131 received = 0;
6132 expect = header_size;
6133 cmd = NULL;
6134 }
6135 }
6136
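/* Added commentary: the labels below are only reachable via goto; the
 * if (0) guards keep them out of the normal fall-through path. reconnect
 * degrades the connection to C_NETWORK_FAILURE (and syncs meta data),
 * disconnect tears it down via C_DISCONNECTING. */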
6137 if (0) {
6138reconnect:
6139 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6140 conn_md_sync(connection);
6141 }
6142 if (0) {
6143disconnect:
6144 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
6145 }
6146
6147 drbd_info(connection, "ack_receiver terminated\n");
6148
6149 return 0;
6150}
6151
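/* Added commentary: work callback that flushes pending acks for one peer
 * device. The meta socket is corked around drbd_finish_peer_reqs() so the
 * acks go out in as few packets as possible; on error the connection is
 * forced into C_NETWORK_FAILURE. */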
6152void drbd_send_acks_wf(struct work_struct *ws)
6153{
6154 struct drbd_peer_device *peer_device =
6155 container_of(ws, struct drbd_peer_device, send_acks_work);
6156 struct drbd_connection *connection = peer_device->connection;
6157 struct drbd_device *device = peer_device->device;
6158 struct net_conf *nc;
6159 int tcp_cork, err;
6160
6161 rcu_read_lock();
6162 nc = rcu_dereference(connection->net_conf);
6163 tcp_cork = nc->tcp_cork;
6164 rcu_read_unlock();
6165
6166 if (tcp_cork)
6167 drbd_tcp_cork(connection->meta.socket);
6168
6169 err = drbd_finish_peer_reqs(device);
6170 kref_put(&device->kref, drbd_destroy_device);
6171 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
6172 struct work_struct send_acks_work alive, which is in the peer_device object */
6173
6174 if (err) {
6175 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6176 return;
6177 }
6178
6179 if (tcp_cork)
6180 drbd_tcp_uncork(connection->meta.socket);
6181
6182 return;
6183}