CIFS: SMBD: Support page offset in RDMA recv
[linux-2.6-block.git] / fs / cifs / smbdirect.c
CommitLineData
03bee01d
LL
1/*
2 * Copyright (C) 2017, Microsoft Corporation.
3 *
4 * Author(s): Long Li <longli@microsoft.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */
f198186a 16#include <linux/module.h>
f64b78fd 17#include <linux/highmem.h>
03bee01d 18#include "smbdirect.h"
f198186a 19#include "cifs_debug.h"
b6903bcf 20#include "cifsproto.h"
f198186a
LL
21
22static struct smbd_response *get_empty_queue_buffer(
23 struct smbd_connection *info);
24static struct smbd_response *get_receive_buffer(
25 struct smbd_connection *info);
26static void put_receive_buffer(
27 struct smbd_connection *info,
28 struct smbd_response *response);
29static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
30static void destroy_receive_buffers(struct smbd_connection *info);
31
32static void put_empty_packet(
33 struct smbd_connection *info, struct smbd_response *response);
34static void enqueue_reassembly(
35 struct smbd_connection *info,
36 struct smbd_response *response, int data_length);
37static struct smbd_response *_get_first_reassembly(
38 struct smbd_connection *info);
39
40static int smbd_post_recv(
41 struct smbd_connection *info,
42 struct smbd_response *response);
43
44static int smbd_post_send_empty(struct smbd_connection *info);
d649e1bb
LL
45static int smbd_post_send_data(
46 struct smbd_connection *info,
47 struct kvec *iov, int n_vec, int remaining_data_length);
48static int smbd_post_send_page(struct smbd_connection *info,
49 struct page *page, unsigned long offset,
50 size_t size, int remaining_data_length);
03bee01d 51
c7398583
LL
52static void destroy_mr_list(struct smbd_connection *info);
53static int allocate_mr_list(struct smbd_connection *info);
54
03bee01d
LL
/* SMBD protocol version */
#define SMBD_V1	0x0100

/* TCP/RDMA port numbers used by the SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Timeout, in ms, for RDMA address lookup and route resolution */
#define RDMA_RESOLVE_TIMEOUT	5000

/* Timeout, in seconds, for SMBD negotiation */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* Minimum receive size and fragmented size defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE		128
#define SMBD_MIN_FRAGMENTED_SIZE	131072

/*
 * Default maximum number of outstanding RDMA read/write on this connection.
 * This value may be lowered during QP creation because of hardware limits.
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/* Maximum number of retries on data transfer operations */
#define SMBD_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMBD manages credits */
#define SMBD_CM_RNR_RETRY		0

/*
 * User configurable initial values per SMBD transport connection,
 * as defined in [MS-SMBD] 3.1.1.1.
 * These may change after an SMBD negotiation.
 */

/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The remote peer's credit request of local peer */
int smbd_send_credit_target = 255;

/* The maximum size of a single message that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/* The maximum fragmented upper-layer payload receive size supported */
int smbd_max_fragmented_recv_size = 1024 * 1024;

/* The maximum single-message size which can be received */
int smbd_max_receive_size = 8192;

/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for the RDMA transport.
 * The values actually used may be lower; they are limited by hardware
 * capabilities.
 */

/* Default maximum number of SGEs in a RDMA write/read */
int smbd_max_frmr_depth = 2048;

/* If the payload is smaller than this many bytes, use RDMA send/recv
 * instead of RDMA read/write
 */
int rdma_readwrite_threshold = 4096;
f198186a
LL
115
116/* Transport logging functions
117 * Logging are defined as classes. They can be OR'ed to define the actual
118 * logging level via module parameter smbd_logging_class
119 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
120 * log_rdma_event()
121 */
122#define LOG_OUTGOING 0x1
123#define LOG_INCOMING 0x2
124#define LOG_READ 0x4
125#define LOG_WRITE 0x8
126#define LOG_RDMA_SEND 0x10
127#define LOG_RDMA_RECV 0x20
128#define LOG_KEEP_ALIVE 0x40
129#define LOG_RDMA_EVENT 0x80
130#define LOG_RDMA_MR 0x100
131static unsigned int smbd_logging_class;
132module_param(smbd_logging_class, uint, 0644);
133MODULE_PARM_DESC(smbd_logging_class,
134 "Logging class for SMBD transport 0x0 to 0x100");
135
136#define ERR 0x0
137#define INFO 0x1
138static unsigned int smbd_logging_level = ERR;
139module_param(smbd_logging_level, uint, 0644);
140MODULE_PARM_DESC(smbd_logging_level,
141 "Logging level for SMBD transport, 0 (default): error, 1: info");
142
143#define log_rdma(level, class, fmt, args...) \
144do { \
145 if (level <= smbd_logging_level || class & smbd_logging_class) \
146 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
147} while (0)
148
149#define log_outgoing(level, fmt, args...) \
150 log_rdma(level, LOG_OUTGOING, fmt, ##args)
151#define log_incoming(level, fmt, args...) \
152 log_rdma(level, LOG_INCOMING, fmt, ##args)
153#define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
154#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
155#define log_rdma_send(level, fmt, args...) \
156 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
157#define log_rdma_recv(level, fmt, args...) \
158 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
159#define log_keep_alive(level, fmt, args...) \
160 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
161#define log_rdma_event(level, fmt, args...) \
162 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
163#define log_rdma_mr(level, fmt, args...) \
164 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
165
166/*
167 * Destroy the transport and related RDMA and memory resources
168 * Need to go through all the pending counters and make sure on one is using
169 * the transport while it is destroyed
170 */
171static void smbd_destroy_rdma_work(struct work_struct *work)
172{
173 struct smbd_response *response;
174 struct smbd_connection *info =
175 container_of(work, struct smbd_connection, destroy_work);
176 unsigned long flags;
177
178 log_rdma_event(INFO, "destroying qp\n");
179 ib_drain_qp(info->id->qp);
180 rdma_destroy_qp(info->id);
181
182 /* Unblock all I/O waiting on the send queue */
183 wake_up_interruptible_all(&info->wait_send_queue);
184
185 log_rdma_event(INFO, "cancelling idle timer\n");
186 cancel_delayed_work_sync(&info->idle_timer_work);
187 log_rdma_event(INFO, "cancelling send immediate work\n");
188 cancel_delayed_work_sync(&info->send_immediate_work);
189
d649e1bb
LL
190 log_rdma_event(INFO, "wait for all send to finish\n");
191 wait_event(info->wait_smbd_send_pending,
192 info->smbd_send_pending == 0);
193
f198186a
LL
194 log_rdma_event(INFO, "wait for all recv to finish\n");
195 wake_up_interruptible(&info->wait_reassembly_queue);
f64b78fd
LL
196 wait_event(info->wait_smbd_recv_pending,
197 info->smbd_recv_pending == 0);
f198186a
LL
198
199 log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
200 wait_event(info->wait_send_pending,
201 atomic_read(&info->send_pending) == 0);
202 wait_event(info->wait_send_payload_pending,
203 atomic_read(&info->send_payload_pending) == 0);
204
c7398583
LL
205 log_rdma_event(INFO, "freeing mr list\n");
206 wake_up_interruptible_all(&info->wait_mr);
207 wait_event(info->wait_for_mr_cleanup,
208 atomic_read(&info->mr_used_count) == 0);
209 destroy_mr_list(info);
210
f198186a
LL
211 /* It's not posssible for upper layer to get to reassembly */
212 log_rdma_event(INFO, "drain the reassembly queue\n");
213 do {
214 spin_lock_irqsave(&info->reassembly_queue_lock, flags);
215 response = _get_first_reassembly(info);
216 if (response) {
217 list_del(&response->list);
218 spin_unlock_irqrestore(
219 &info->reassembly_queue_lock, flags);
220 put_receive_buffer(info, response);
f9de151b
SF
221 } else
222 spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
f198186a 223 } while (response);
f9de151b 224
f198186a
LL
225 info->reassembly_data_length = 0;
226
227 log_rdma_event(INFO, "free receive buffers\n");
228 wait_event(info->wait_receive_queues,
229 info->count_receive_queue + info->count_empty_packet_queue
230 == info->receive_credit_max);
231 destroy_receive_buffers(info);
232
233 ib_free_cq(info->send_cq);
234 ib_free_cq(info->recv_cq);
235 ib_dealloc_pd(info->pd);
236 rdma_destroy_id(info->id);
237
238 /* free mempools */
239 mempool_destroy(info->request_mempool);
240 kmem_cache_destroy(info->request_cache);
241
242 mempool_destroy(info->response_mempool);
243 kmem_cache_destroy(info->response_cache);
244
245 info->transport_status = SMBD_DESTROYED;
246 wake_up_all(&info->wait_destroy);
247}
248
249static int smbd_process_disconnected(struct smbd_connection *info)
250{
251 schedule_work(&info->destroy_work);
252 return 0;
253}
254
255static void smbd_disconnect_rdma_work(struct work_struct *work)
256{
257 struct smbd_connection *info =
258 container_of(work, struct smbd_connection, disconnect_work);
259
260 if (info->transport_status == SMBD_CONNECTED) {
261 info->transport_status = SMBD_DISCONNECTING;
262 rdma_disconnect(info->id);
263 }
264}
265
266static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
267{
268 queue_work(info->workqueue, &info->disconnect_work);
269}
270
271/* Upcall from RDMA CM */
272static int smbd_conn_upcall(
273 struct rdma_cm_id *id, struct rdma_cm_event *event)
274{
275 struct smbd_connection *info = id->context;
276
277 log_rdma_event(INFO, "event=%d status=%d\n",
278 event->event, event->status);
279
280 switch (event->event) {
281 case RDMA_CM_EVENT_ADDR_RESOLVED:
282 case RDMA_CM_EVENT_ROUTE_RESOLVED:
283 info->ri_rc = 0;
284 complete(&info->ri_done);
285 break;
286
287 case RDMA_CM_EVENT_ADDR_ERROR:
288 info->ri_rc = -EHOSTUNREACH;
289 complete(&info->ri_done);
290 break;
291
292 case RDMA_CM_EVENT_ROUTE_ERROR:
293 info->ri_rc = -ENETUNREACH;
294 complete(&info->ri_done);
295 break;
296
297 case RDMA_CM_EVENT_ESTABLISHED:
298 log_rdma_event(INFO, "connected event=%d\n", event->event);
299 info->transport_status = SMBD_CONNECTED;
300 wake_up_interruptible(&info->conn_wait);
301 break;
302
303 case RDMA_CM_EVENT_CONNECT_ERROR:
304 case RDMA_CM_EVENT_UNREACHABLE:
305 case RDMA_CM_EVENT_REJECTED:
306 log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
307 info->transport_status = SMBD_DISCONNECTED;
308 wake_up_interruptible(&info->conn_wait);
309 break;
310
311 case RDMA_CM_EVENT_DEVICE_REMOVAL:
312 case RDMA_CM_EVENT_DISCONNECTED:
313 /* This happenes when we fail the negotiation */
314 if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
315 info->transport_status = SMBD_DISCONNECTED;
316 wake_up(&info->conn_wait);
317 break;
318 }
319
320 info->transport_status = SMBD_DISCONNECTED;
321 smbd_process_disconnected(info);
322 break;
323
324 default:
325 break;
326 }
327
328 return 0;
329}
330
331/* Upcall from RDMA QP */
332static void
333smbd_qp_async_error_upcall(struct ib_event *event, void *context)
334{
335 struct smbd_connection *info = context;
336
337 log_rdma_event(ERR, "%s on device %s info %p\n",
338 ib_event_msg(event->event), event->device->name, info);
339
340 switch (event->event) {
341 case IB_EVENT_CQ_ERR:
342 case IB_EVENT_QP_FATAL:
343 smbd_disconnect_rdma_connection(info);
344
345 default:
346 break;
347 }
348}
349
350static inline void *smbd_request_payload(struct smbd_request *request)
351{
352 return (void *)request->packet;
353}
354
355static inline void *smbd_response_payload(struct smbd_response *response)
356{
357 return (void *)response->packet;
358}
359
360/* Called when a RDMA send is done */
361static void send_done(struct ib_cq *cq, struct ib_wc *wc)
362{
363 int i;
364 struct smbd_request *request =
365 container_of(wc->wr_cqe, struct smbd_request, cqe);
366
367 log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
368 request, wc->status);
369
370 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
371 log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
372 wc->status, wc->opcode);
373 smbd_disconnect_rdma_connection(request->info);
374 }
375
376 for (i = 0; i < request->num_sge; i++)
377 ib_dma_unmap_single(request->info->id->device,
378 request->sge[i].addr,
379 request->sge[i].length,
380 DMA_TO_DEVICE);
381
382 if (request->has_payload) {
383 if (atomic_dec_and_test(&request->info->send_payload_pending))
384 wake_up(&request->info->wait_send_payload_pending);
385 } else {
386 if (atomic_dec_and_test(&request->info->send_pending))
387 wake_up(&request->info->wait_send_pending);
388 }
389
390 mempool_free(request, request->info->request_mempool);
391}
392
393static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
394{
395 log_rdma_event(INFO, "resp message min_version %u max_version %u "
396 "negotiated_version %u credits_requested %u "
397 "credits_granted %u status %u max_readwrite_size %u "
398 "preferred_send_size %u max_receive_size %u "
399 "max_fragmented_size %u\n",
400 resp->min_version, resp->max_version, resp->negotiated_version,
401 resp->credits_requested, resp->credits_granted, resp->status,
402 resp->max_readwrite_size, resp->preferred_send_size,
403 resp->max_receive_size, resp->max_fragmented_size);
404}
405
406/*
407 * Process a negotiation response message, according to [MS-SMBD]3.1.5.7
408 * response, packet_length: the negotiation response message
409 * return value: true if negotiation is a success, false if failed
410 */
411static bool process_negotiation_response(
412 struct smbd_response *response, int packet_length)
413{
414 struct smbd_connection *info = response->info;
415 struct smbd_negotiate_resp *packet = smbd_response_payload(response);
416
417 if (packet_length < sizeof(struct smbd_negotiate_resp)) {
418 log_rdma_event(ERR,
419 "error: packet_length=%d\n", packet_length);
420 return false;
421 }
422
423 if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
424 log_rdma_event(ERR, "error: negotiated_version=%x\n",
425 le16_to_cpu(packet->negotiated_version));
426 return false;
427 }
428 info->protocol = le16_to_cpu(packet->negotiated_version);
429
430 if (packet->credits_requested == 0) {
431 log_rdma_event(ERR, "error: credits_requested==0\n");
432 return false;
433 }
434 info->receive_credit_target = le16_to_cpu(packet->credits_requested);
435
436 if (packet->credits_granted == 0) {
437 log_rdma_event(ERR, "error: credits_granted==0\n");
438 return false;
439 }
440 atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
441
442 atomic_set(&info->receive_credits, 0);
443
444 if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
445 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
446 le32_to_cpu(packet->preferred_send_size));
447 return false;
448 }
449 info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
450
451 if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
452 log_rdma_event(ERR, "error: max_receive_size=%d\n",
453 le32_to_cpu(packet->max_receive_size));
454 return false;
455 }
456 info->max_send_size = min_t(int, info->max_send_size,
457 le32_to_cpu(packet->max_receive_size));
458
459 if (le32_to_cpu(packet->max_fragmented_size) <
460 SMBD_MIN_FRAGMENTED_SIZE) {
461 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
462 le32_to_cpu(packet->max_fragmented_size));
463 return false;
464 }
465 info->max_fragmented_send_size =
466 le32_to_cpu(packet->max_fragmented_size);
c7398583
LL
467 info->rdma_readwrite_threshold =
468 rdma_readwrite_threshold > info->max_fragmented_send_size ?
469 info->max_fragmented_send_size :
470 rdma_readwrite_threshold;
471
472
473 info->max_readwrite_size = min_t(u32,
474 le32_to_cpu(packet->max_readwrite_size),
475 info->max_frmr_depth * PAGE_SIZE);
476 info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
f198186a
LL
477
478 return true;
479}
480
481/*
482 * Check and schedule to send an immediate packet
483 * This is used to extend credtis to remote peer to keep the transport busy
484 */
485static void check_and_send_immediate(struct smbd_connection *info)
486{
487 if (info->transport_status != SMBD_CONNECTED)
488 return;
489
490 info->send_immediate = true;
491
492 /*
493 * Promptly send a packet if our peer is running low on receive
494 * credits
495 */
496 if (atomic_read(&info->receive_credits) <
497 info->receive_credit_target - 1)
498 queue_delayed_work(
499 info->workqueue, &info->send_immediate_work, 0);
500}
501
502static void smbd_post_send_credits(struct work_struct *work)
503{
504 int ret = 0;
505 int use_receive_queue = 1;
506 int rc;
507 struct smbd_response *response;
508 struct smbd_connection *info =
509 container_of(work, struct smbd_connection,
510 post_send_credits_work);
511
512 if (info->transport_status != SMBD_CONNECTED) {
513 wake_up(&info->wait_receive_queues);
514 return;
515 }
516
517 if (info->receive_credit_target >
518 atomic_read(&info->receive_credits)) {
519 while (true) {
520 if (use_receive_queue)
521 response = get_receive_buffer(info);
522 else
523 response = get_empty_queue_buffer(info);
524 if (!response) {
525 /* now switch to emtpy packet queue */
526 if (use_receive_queue) {
527 use_receive_queue = 0;
528 continue;
529 } else
530 break;
531 }
532
533 response->type = SMBD_TRANSFER_DATA;
534 response->first_segment = false;
535 rc = smbd_post_recv(info, response);
536 if (rc) {
537 log_rdma_recv(ERR,
538 "post_recv failed rc=%d\n", rc);
539 put_receive_buffer(info, response);
540 break;
541 }
542
543 ret++;
544 }
545 }
546
547 spin_lock(&info->lock_new_credits_offered);
548 info->new_credits_offered += ret;
549 spin_unlock(&info->lock_new_credits_offered);
550
551 atomic_add(ret, &info->receive_credits);
552
553 /* Check if we can post new receive and grant credits to peer */
554 check_and_send_immediate(info);
555}
556
557static void smbd_recv_done_work(struct work_struct *work)
558{
559 struct smbd_connection *info =
560 container_of(work, struct smbd_connection, recv_done_work);
561
562 /*
563 * We may have new send credits granted from remote peer
564 * If any sender is blcoked on lack of credets, unblock it
565 */
566 if (atomic_read(&info->send_credits))
567 wake_up_interruptible(&info->wait_send_queue);
568
569 /*
570 * Check if we need to send something to remote peer to
571 * grant more credits or respond to KEEP_ALIVE packet
572 */
573 check_and_send_immediate(info);
574}
575
576/* Called from softirq, when recv is done */
577static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
578{
579 struct smbd_data_transfer *data_transfer;
580 struct smbd_response *response =
581 container_of(wc->wr_cqe, struct smbd_response, cqe);
582 struct smbd_connection *info = response->info;
583 int data_length = 0;
584
585 log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
586 "byte_len=%d pkey_index=%x\n",
587 response, response->type, wc->status, wc->opcode,
588 wc->byte_len, wc->pkey_index);
589
590 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
591 log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
592 wc->status, wc->opcode);
593 smbd_disconnect_rdma_connection(info);
594 goto error;
595 }
596
597 ib_dma_sync_single_for_cpu(
598 wc->qp->device,
599 response->sge.addr,
600 response->sge.length,
601 DMA_FROM_DEVICE);
602
603 switch (response->type) {
604 /* SMBD negotiation response */
605 case SMBD_NEGOTIATE_RESP:
606 dump_smbd_negotiate_resp(smbd_response_payload(response));
607 info->full_packet_received = true;
608 info->negotiate_done =
609 process_negotiation_response(response, wc->byte_len);
610 complete(&info->negotiate_completion);
611 break;
612
613 /* SMBD data transfer packet */
614 case SMBD_TRANSFER_DATA:
615 data_transfer = smbd_response_payload(response);
616 data_length = le32_to_cpu(data_transfer->data_length);
617
618 /*
619 * If this is a packet with data playload place the data in
620 * reassembly queue and wake up the reading thread
621 */
622 if (data_length) {
623 if (info->full_packet_received)
624 response->first_segment = true;
625
626 if (le32_to_cpu(data_transfer->remaining_data_length))
627 info->full_packet_received = false;
628 else
629 info->full_packet_received = true;
630
631 enqueue_reassembly(
632 info,
633 response,
634 data_length);
635 } else
636 put_empty_packet(info, response);
637
638 if (data_length)
639 wake_up_interruptible(&info->wait_reassembly_queue);
640
641 atomic_dec(&info->receive_credits);
642 info->receive_credit_target =
643 le16_to_cpu(data_transfer->credits_requested);
644 atomic_add(le16_to_cpu(data_transfer->credits_granted),
645 &info->send_credits);
646
647 log_incoming(INFO, "data flags %d data_offset %d "
648 "data_length %d remaining_data_length %d\n",
649 le16_to_cpu(data_transfer->flags),
650 le32_to_cpu(data_transfer->data_offset),
651 le32_to_cpu(data_transfer->data_length),
652 le32_to_cpu(data_transfer->remaining_data_length));
653
654 /* Send a KEEP_ALIVE response right away if requested */
655 info->keep_alive_requested = KEEP_ALIVE_NONE;
656 if (le16_to_cpu(data_transfer->flags) &
657 SMB_DIRECT_RESPONSE_REQUESTED) {
658 info->keep_alive_requested = KEEP_ALIVE_PENDING;
659 }
660
661 queue_work(info->workqueue, &info->recv_done_work);
662 return;
663
664 default:
665 log_rdma_recv(ERR,
666 "unexpected response type=%d\n", response->type);
667 }
668
669error:
670 put_receive_buffer(info, response);
671}
672
673static struct rdma_cm_id *smbd_create_id(
674 struct smbd_connection *info,
675 struct sockaddr *dstaddr, int port)
676{
677 struct rdma_cm_id *id;
678 int rc;
679 __be16 *sport;
680
681 id = rdma_create_id(&init_net, smbd_conn_upcall, info,
682 RDMA_PS_TCP, IB_QPT_RC);
683 if (IS_ERR(id)) {
684 rc = PTR_ERR(id);
685 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
686 return id;
687 }
688
689 if (dstaddr->sa_family == AF_INET6)
690 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
691 else
692 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
693
694 *sport = htons(port);
695
696 init_completion(&info->ri_done);
697 info->ri_rc = -ETIMEDOUT;
698
699 rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
700 RDMA_RESOLVE_TIMEOUT);
701 if (rc) {
702 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
703 goto out;
704 }
705 wait_for_completion_interruptible_timeout(
706 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
707 rc = info->ri_rc;
708 if (rc) {
709 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
710 goto out;
711 }
712
713 info->ri_rc = -ETIMEDOUT;
714 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
715 if (rc) {
716 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
717 goto out;
718 }
719 wait_for_completion_interruptible_timeout(
720 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
721 rc = info->ri_rc;
722 if (rc) {
723 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
724 goto out;
725 }
726
727 return id;
728
729out:
730 rdma_destroy_id(id);
731 return ERR_PTR(rc);
732}
733
734/*
735 * Test if FRWR (Fast Registration Work Requests) is supported on the device
736 * This implementation requries FRWR on RDMA read/write
737 * return value: true if it is supported
738 */
739static bool frwr_is_supported(struct ib_device_attr *attrs)
740{
741 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
742 return false;
743 if (attrs->max_fast_reg_page_list_len == 0)
744 return false;
745 return true;
746}
747
748static int smbd_ia_open(
749 struct smbd_connection *info,
750 struct sockaddr *dstaddr, int port)
751{
752 int rc;
753
754 info->id = smbd_create_id(info, dstaddr, port);
755 if (IS_ERR(info->id)) {
756 rc = PTR_ERR(info->id);
757 goto out1;
758 }
759
760 if (!frwr_is_supported(&info->id->device->attrs)) {
761 log_rdma_event(ERR,
762 "Fast Registration Work Requests "
763 "(FRWR) is not supported\n");
764 log_rdma_event(ERR,
765 "Device capability flags = %llx "
766 "max_fast_reg_page_list_len = %u\n",
767 info->id->device->attrs.device_cap_flags,
768 info->id->device->attrs.max_fast_reg_page_list_len);
769 rc = -EPROTONOSUPPORT;
770 goto out2;
771 }
c7398583
LL
772 info->max_frmr_depth = min_t(int,
773 smbd_max_frmr_depth,
774 info->id->device->attrs.max_fast_reg_page_list_len);
775 info->mr_type = IB_MR_TYPE_MEM_REG;
776 if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
777 info->mr_type = IB_MR_TYPE_SG_GAPS;
f198186a
LL
778
779 info->pd = ib_alloc_pd(info->id->device, 0);
780 if (IS_ERR(info->pd)) {
781 rc = PTR_ERR(info->pd);
782 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
783 goto out2;
784 }
785
786 return 0;
787
788out2:
789 rdma_destroy_id(info->id);
790 info->id = NULL;
791
792out1:
793 return rc;
794}
795
796/*
797 * Send a negotiation request message to the peer
798 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
799 * After negotiation, the transport is connected and ready for
800 * carrying upper layer SMB payload
801 */
802static int smbd_post_send_negotiate_req(struct smbd_connection *info)
803{
804 struct ib_send_wr send_wr, *send_wr_fail;
805 int rc = -ENOMEM;
806 struct smbd_request *request;
807 struct smbd_negotiate_req *packet;
808
809 request = mempool_alloc(info->request_mempool, GFP_KERNEL);
810 if (!request)
811 return rc;
812
813 request->info = info;
814
815 packet = smbd_request_payload(request);
816 packet->min_version = cpu_to_le16(SMBD_V1);
817 packet->max_version = cpu_to_le16(SMBD_V1);
818 packet->reserved = 0;
819 packet->credits_requested = cpu_to_le16(info->send_credit_target);
820 packet->preferred_send_size = cpu_to_le32(info->max_send_size);
821 packet->max_receive_size = cpu_to_le32(info->max_receive_size);
822 packet->max_fragmented_size =
823 cpu_to_le32(info->max_fragmented_recv_size);
824
825 request->num_sge = 1;
826 request->sge[0].addr = ib_dma_map_single(
827 info->id->device, (void *)packet,
828 sizeof(*packet), DMA_TO_DEVICE);
829 if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
830 rc = -EIO;
831 goto dma_mapping_failed;
832 }
833
834 request->sge[0].length = sizeof(*packet);
835 request->sge[0].lkey = info->pd->local_dma_lkey;
836
837 ib_dma_sync_single_for_device(
838 info->id->device, request->sge[0].addr,
839 request->sge[0].length, DMA_TO_DEVICE);
840
841 request->cqe.done = send_done;
842
843 send_wr.next = NULL;
844 send_wr.wr_cqe = &request->cqe;
845 send_wr.sg_list = request->sge;
846 send_wr.num_sge = request->num_sge;
847 send_wr.opcode = IB_WR_SEND;
848 send_wr.send_flags = IB_SEND_SIGNALED;
849
850 log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
851 request->sge[0].addr,
852 request->sge[0].length, request->sge[0].lkey);
853
854 request->has_payload = false;
855 atomic_inc(&info->send_pending);
856 rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
857 if (!rc)
858 return 0;
859
860 /* if we reach here, post send failed */
861 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
862 atomic_dec(&info->send_pending);
863 ib_dma_unmap_single(info->id->device, request->sge[0].addr,
864 request->sge[0].length, DMA_TO_DEVICE);
865
21a4e14a
LL
866 smbd_disconnect_rdma_connection(info);
867
f198186a
LL
868dma_mapping_failed:
869 mempool_free(request, info->request_mempool);
870 return rc;
871}
872
873/*
874 * Extend the credits to remote peer
875 * This implements [MS-SMBD] 3.1.5.9
876 * The idea is that we should extend credits to remote peer as quickly as
877 * it's allowed, to maintain data flow. We allocate as much receive
878 * buffer as possible, and extend the receive credits to remote peer
879 * return value: the new credtis being granted.
880 */
881static int manage_credits_prior_sending(struct smbd_connection *info)
882{
883 int new_credits;
884
885 spin_lock(&info->lock_new_credits_offered);
886 new_credits = info->new_credits_offered;
887 info->new_credits_offered = 0;
888 spin_unlock(&info->lock_new_credits_offered);
889
890 return new_credits;
891}
892
893/*
894 * Check if we need to send a KEEP_ALIVE message
895 * The idle connection timer triggers a KEEP_ALIVE message when expires
896 * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
897 * back a response.
898 * return value:
899 * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
900 * 0: otherwise
901 */
902static int manage_keep_alive_before_sending(struct smbd_connection *info)
903{
904 if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
905 info->keep_alive_requested = KEEP_ALIVE_SENT;
906 return 1;
907 }
908 return 0;
909}
910
911/*
912 * Build and prepare the SMBD packet header
913 * This function waits for avaialbe send credits and build a SMBD packet
914 * header. The caller then optional append payload to the packet after
915 * the header
916 * intput values
917 * size: the size of the payload
918 * remaining_data_length: remaining data to send if this is part of a
919 * fragmented packet
920 * output values
921 * request_out: the request allocated from this function
922 * return values: 0 on success, otherwise actual error code returned
923 */
924static int smbd_create_header(struct smbd_connection *info,
925 int size, int remaining_data_length,
926 struct smbd_request **request_out)
927{
928 struct smbd_request *request;
929 struct smbd_data_transfer *packet;
930 int header_length;
931 int rc;
932
933 /* Wait for send credits. A SMBD packet needs one credit */
934 rc = wait_event_interruptible(info->wait_send_queue,
935 atomic_read(&info->send_credits) > 0 ||
936 info->transport_status != SMBD_CONNECTED);
937 if (rc)
938 return rc;
939
940 if (info->transport_status != SMBD_CONNECTED) {
941 log_outgoing(ERR, "disconnected not sending\n");
942 return -ENOENT;
943 }
944 atomic_dec(&info->send_credits);
945
946 request = mempool_alloc(info->request_mempool, GFP_KERNEL);
947 if (!request) {
948 rc = -ENOMEM;
949 goto err;
950 }
951
952 request->info = info;
953
954 /* Fill in the packet header */
955 packet = smbd_request_payload(request);
956 packet->credits_requested = cpu_to_le16(info->send_credit_target);
957 packet->credits_granted =
958 cpu_to_le16(manage_credits_prior_sending(info));
959 info->send_immediate = false;
960
961 packet->flags = 0;
962 if (manage_keep_alive_before_sending(info))
963 packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
964
965 packet->reserved = 0;
966 if (!size)
967 packet->data_offset = 0;
968 else
969 packet->data_offset = cpu_to_le32(24);
970 packet->data_length = cpu_to_le32(size);
971 packet->remaining_data_length = cpu_to_le32(remaining_data_length);
972 packet->padding = 0;
973
974 log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
975 "data_offset=%d data_length=%d remaining_data_length=%d\n",
976 le16_to_cpu(packet->credits_requested),
977 le16_to_cpu(packet->credits_granted),
978 le32_to_cpu(packet->data_offset),
979 le32_to_cpu(packet->data_length),
980 le32_to_cpu(packet->remaining_data_length));
981
982 /* Map the packet to DMA */
983 header_length = sizeof(struct smbd_data_transfer);
984 /* If this is a packet without payload, don't send padding */
985 if (!size)
986 header_length = offsetof(struct smbd_data_transfer, padding);
987
988 request->num_sge = 1;
989 request->sge[0].addr = ib_dma_map_single(info->id->device,
990 (void *)packet,
991 header_length,
992 DMA_BIDIRECTIONAL);
993 if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
994 mempool_free(request, info->request_mempool);
995 rc = -EIO;
996 goto err;
997 }
998
999 request->sge[0].length = header_length;
1000 request->sge[0].lkey = info->pd->local_dma_lkey;
1001
1002 *request_out = request;
1003 return 0;
1004
1005err:
1006 atomic_inc(&info->send_credits);
1007 return rc;
1008}
1009
1010static void smbd_destroy_header(struct smbd_connection *info,
1011 struct smbd_request *request)
1012{
1013
1014 ib_dma_unmap_single(info->id->device,
1015 request->sge[0].addr,
1016 request->sge[0].length,
1017 DMA_TO_DEVICE);
1018 mempool_free(request, info->request_mempool);
1019 atomic_inc(&info->send_credits);
1020}
1021
1022/* Post the send request */
1023static int smbd_post_send(struct smbd_connection *info,
1024 struct smbd_request *request, bool has_payload)
1025{
1026 struct ib_send_wr send_wr, *send_wr_fail;
1027 int rc, i;
1028
1029 for (i = 0; i < request->num_sge; i++) {
1030 log_rdma_send(INFO,
ac65cb62 1031 "rdma_request sge[%d] addr=%llu length=%u\n",
ff30b89e 1032 i, request->sge[i].addr, request->sge[i].length);
f198186a
LL
1033 ib_dma_sync_single_for_device(
1034 info->id->device,
1035 request->sge[i].addr,
1036 request->sge[i].length,
1037 DMA_TO_DEVICE);
1038 }
1039
1040 request->cqe.done = send_done;
1041
1042 send_wr.next = NULL;
1043 send_wr.wr_cqe = &request->cqe;
1044 send_wr.sg_list = request->sge;
1045 send_wr.num_sge = request->num_sge;
1046 send_wr.opcode = IB_WR_SEND;
1047 send_wr.send_flags = IB_SEND_SIGNALED;
1048
1049 if (has_payload) {
1050 request->has_payload = true;
1051 atomic_inc(&info->send_payload_pending);
1052 } else {
1053 request->has_payload = false;
1054 atomic_inc(&info->send_pending);
1055 }
1056
1057 rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
1058 if (rc) {
1059 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1060 if (has_payload) {
1061 if (atomic_dec_and_test(&info->send_payload_pending))
1062 wake_up(&info->wait_send_payload_pending);
1063 } else {
1064 if (atomic_dec_and_test(&info->send_pending))
1065 wake_up(&info->wait_send_pending);
1066 }
21a4e14a 1067 smbd_disconnect_rdma_connection(info);
f198186a
LL
1068 } else
1069 /* Reset timer for idle connection after packet is sent */
1070 mod_delayed_work(info->workqueue, &info->idle_timer_work,
1071 info->keep_alive_interval*HZ);
1072
1073 return rc;
1074}
1075
1076static int smbd_post_send_sgl(struct smbd_connection *info,
1077 struct scatterlist *sgl, int data_length, int remaining_data_length)
1078{
1079 int num_sgs;
1080 int i, rc;
1081 struct smbd_request *request;
1082 struct scatterlist *sg;
1083
1084 rc = smbd_create_header(
1085 info, data_length, remaining_data_length, &request);
1086 if (rc)
1087 return rc;
1088
1089 num_sgs = sgl ? sg_nents(sgl) : 0;
1090 for_each_sg(sgl, sg, num_sgs, i) {
1091 request->sge[i+1].addr =
1092 ib_dma_map_page(info->id->device, sg_page(sg),
1093 sg->offset, sg->length, DMA_BIDIRECTIONAL);
1094 if (ib_dma_mapping_error(
1095 info->id->device, request->sge[i+1].addr)) {
1096 rc = -EIO;
1097 request->sge[i+1].addr = 0;
1098 goto dma_mapping_failure;
1099 }
1100 request->sge[i+1].length = sg->length;
1101 request->sge[i+1].lkey = info->pd->local_dma_lkey;
1102 request->num_sge++;
1103 }
1104
1105 rc = smbd_post_send(info, request, data_length);
1106 if (!rc)
1107 return 0;
1108
1109dma_mapping_failure:
1110 for (i = 1; i < request->num_sge; i++)
1111 if (request->sge[i].addr)
1112 ib_dma_unmap_single(info->id->device,
1113 request->sge[i].addr,
1114 request->sge[i].length,
1115 DMA_TO_DEVICE);
1116 smbd_destroy_header(info, request);
1117 return rc;
1118}
1119
d649e1bb
LL
1120/*
1121 * Send a page
1122 * page: the page to send
1123 * offset: offset in the page to send
1124 * size: length in the page to send
1125 * remaining_data_length: remaining data to send in this payload
1126 */
1127static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
1128 unsigned long offset, size_t size, int remaining_data_length)
1129{
1130 struct scatterlist sgl;
1131
1132 sg_init_table(&sgl, 1);
1133 sg_set_page(&sgl, page, size, offset);
1134
1135 return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
1136}
1137
f198186a
LL
1138/*
1139 * Send an empty message
1140 * Empty message is used to extend credits to peer to for keep live
1141 * while there is no upper layer payload to send at the time
1142 */
1143static int smbd_post_send_empty(struct smbd_connection *info)
1144{
1145 info->count_send_empty++;
1146 return smbd_post_send_sgl(info, NULL, 0, 0);
1147}
1148
d649e1bb
LL
1149/*
1150 * Send a data buffer
1151 * iov: the iov array describing the data buffers
1152 * n_vec: number of iov array
1153 * remaining_data_length: remaining data to send following this packet
1154 * in segmented SMBD packet
1155 */
1156static int smbd_post_send_data(
1157 struct smbd_connection *info, struct kvec *iov, int n_vec,
1158 int remaining_data_length)
1159{
1160 int i;
1161 u32 data_length = 0;
1162 struct scatterlist sgl[SMBDIRECT_MAX_SGE];
1163
1164 if (n_vec > SMBDIRECT_MAX_SGE) {
1165 cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
1166 return -ENOMEM;
1167 }
1168
1169 sg_init_table(sgl, n_vec);
1170 for (i = 0; i < n_vec; i++) {
1171 data_length += iov[i].iov_len;
1172 sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
1173 }
1174
1175 return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
1176}
1177
f198186a
LL
1178/*
1179 * Post a receive request to the transport
1180 * The remote peer can only send data when a receive request is posted
1181 * The interaction is controlled by send/receive credit system
1182 */
1183static int smbd_post_recv(
1184 struct smbd_connection *info, struct smbd_response *response)
1185{
1186 struct ib_recv_wr recv_wr, *recv_wr_fail = NULL;
1187 int rc = -EIO;
1188
1189 response->sge.addr = ib_dma_map_single(
1190 info->id->device, response->packet,
1191 info->max_receive_size, DMA_FROM_DEVICE);
1192 if (ib_dma_mapping_error(info->id->device, response->sge.addr))
1193 return rc;
1194
1195 response->sge.length = info->max_receive_size;
1196 response->sge.lkey = info->pd->local_dma_lkey;
1197
1198 response->cqe.done = recv_done;
1199
1200 recv_wr.wr_cqe = &response->cqe;
1201 recv_wr.next = NULL;
1202 recv_wr.sg_list = &response->sge;
1203 recv_wr.num_sge = 1;
1204
1205 rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail);
1206 if (rc) {
1207 ib_dma_unmap_single(info->id->device, response->sge.addr,
1208 response->sge.length, DMA_FROM_DEVICE);
21a4e14a 1209 smbd_disconnect_rdma_connection(info);
f198186a
LL
1210 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1211 }
1212
1213 return rc;
1214}
1215
1216/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
1217static int smbd_negotiate(struct smbd_connection *info)
1218{
1219 int rc;
1220 struct smbd_response *response = get_receive_buffer(info);
1221
1222 response->type = SMBD_NEGOTIATE_RESP;
1223 rc = smbd_post_recv(info, response);
1224 log_rdma_event(INFO,
1225 "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
1226 "iov.lkey=%x\n",
1227 rc, response->sge.addr,
1228 response->sge.length, response->sge.lkey);
1229 if (rc)
1230 return rc;
1231
1232 init_completion(&info->negotiate_completion);
1233 info->negotiate_done = false;
1234 rc = smbd_post_send_negotiate_req(info);
1235 if (rc)
1236 return rc;
1237
1238 rc = wait_for_completion_interruptible_timeout(
1239 &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
1240 log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
1241
1242 if (info->negotiate_done)
1243 return 0;
1244
1245 if (rc == 0)
1246 rc = -ETIMEDOUT;
1247 else if (rc == -ERESTARTSYS)
1248 rc = -EINTR;
1249 else
1250 rc = -ENOTCONN;
1251
1252 return rc;
1253}
1254
1255static void put_empty_packet(
1256 struct smbd_connection *info, struct smbd_response *response)
1257{
1258 spin_lock(&info->empty_packet_queue_lock);
1259 list_add_tail(&response->list, &info->empty_packet_queue);
1260 info->count_empty_packet_queue++;
1261 spin_unlock(&info->empty_packet_queue_lock);
1262
1263 queue_work(info->workqueue, &info->post_send_credits_work);
1264}
1265
1266/*
1267 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1268 * This is a queue for reassembling upper layer payload and present to upper
1269 * layer. All the inncoming payload go to the reassembly queue, regardless of
1270 * if reassembly is required. The uuper layer code reads from the queue for all
1271 * incoming payloads.
1272 * Put a received packet to the reassembly queue
1273 * response: the packet received
1274 * data_length: the size of payload in this packet
1275 */
1276static void enqueue_reassembly(
1277 struct smbd_connection *info,
1278 struct smbd_response *response,
1279 int data_length)
1280{
1281 spin_lock(&info->reassembly_queue_lock);
1282 list_add_tail(&response->list, &info->reassembly_queue);
1283 info->reassembly_queue_length++;
1284 /*
1285 * Make sure reassembly_data_length is updated after list and
1286 * reassembly_queue_length are updated. On the dequeue side
1287 * reassembly_data_length is checked without a lock to determine
1288 * if reassembly_queue_length and list is up to date
1289 */
1290 virt_wmb();
1291 info->reassembly_data_length += data_length;
1292 spin_unlock(&info->reassembly_queue_lock);
1293 info->count_reassembly_queue++;
1294 info->count_enqueue_reassembly_queue++;
1295}
1296
1297/*
1298 * Get the first entry at the front of reassembly queue
1299 * Caller is responsible for locking
1300 * return value: the first entry if any, NULL if queue is empty
1301 */
1302static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
1303{
1304 struct smbd_response *ret = NULL;
1305
1306 if (!list_empty(&info->reassembly_queue)) {
1307 ret = list_first_entry(
1308 &info->reassembly_queue,
1309 struct smbd_response, list);
1310 }
1311 return ret;
1312}
1313
1314static struct smbd_response *get_empty_queue_buffer(
1315 struct smbd_connection *info)
1316{
1317 struct smbd_response *ret = NULL;
1318 unsigned long flags;
1319
1320 spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
1321 if (!list_empty(&info->empty_packet_queue)) {
1322 ret = list_first_entry(
1323 &info->empty_packet_queue,
1324 struct smbd_response, list);
1325 list_del(&ret->list);
1326 info->count_empty_packet_queue--;
1327 }
1328 spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
1329
1330 return ret;
1331}
1332
1333/*
1334 * Get a receive buffer
1335 * For each remote send, we need to post a receive. The receive buffers are
1336 * pre-allocated in advance.
1337 * return value: the receive buffer, NULL if none is available
1338 */
1339static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
1340{
1341 struct smbd_response *ret = NULL;
1342 unsigned long flags;
1343
1344 spin_lock_irqsave(&info->receive_queue_lock, flags);
1345 if (!list_empty(&info->receive_queue)) {
1346 ret = list_first_entry(
1347 &info->receive_queue,
1348 struct smbd_response, list);
1349 list_del(&ret->list);
1350 info->count_receive_queue--;
1351 info->count_get_receive_buffer++;
1352 }
1353 spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1354
1355 return ret;
1356}
1357
1358/*
1359 * Return a receive buffer
1360 * Upon returning of a receive buffer, we can post new receive and extend
1361 * more receive credits to remote peer. This is done immediately after a
1362 * receive buffer is returned.
1363 */
1364static void put_receive_buffer(
1365 struct smbd_connection *info, struct smbd_response *response)
1366{
1367 unsigned long flags;
1368
1369 ib_dma_unmap_single(info->id->device, response->sge.addr,
1370 response->sge.length, DMA_FROM_DEVICE);
1371
1372 spin_lock_irqsave(&info->receive_queue_lock, flags);
1373 list_add_tail(&response->list, &info->receive_queue);
1374 info->count_receive_queue++;
1375 info->count_put_receive_buffer++;
1376 spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1377
1378 queue_work(info->workqueue, &info->post_send_credits_work);
1379}
1380
1381/* Preallocate all receive buffer on transport establishment */
1382static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
1383{
1384 int i;
1385 struct smbd_response *response;
1386
1387 INIT_LIST_HEAD(&info->reassembly_queue);
1388 spin_lock_init(&info->reassembly_queue_lock);
1389 info->reassembly_data_length = 0;
1390 info->reassembly_queue_length = 0;
1391
1392 INIT_LIST_HEAD(&info->receive_queue);
1393 spin_lock_init(&info->receive_queue_lock);
1394 info->count_receive_queue = 0;
1395
1396 INIT_LIST_HEAD(&info->empty_packet_queue);
1397 spin_lock_init(&info->empty_packet_queue_lock);
1398 info->count_empty_packet_queue = 0;
1399
1400 init_waitqueue_head(&info->wait_receive_queues);
1401
1402 for (i = 0; i < num_buf; i++) {
1403 response = mempool_alloc(info->response_mempool, GFP_KERNEL);
1404 if (!response)
1405 goto allocate_failed;
1406
1407 response->info = info;
1408 list_add_tail(&response->list, &info->receive_queue);
1409 info->count_receive_queue++;
1410 }
1411
1412 return 0;
1413
1414allocate_failed:
1415 while (!list_empty(&info->receive_queue)) {
1416 response = list_first_entry(
1417 &info->receive_queue,
1418 struct smbd_response, list);
1419 list_del(&response->list);
1420 info->count_receive_queue--;
1421
1422 mempool_free(response, info->response_mempool);
1423 }
1424 return -ENOMEM;
1425}
1426
1427static void destroy_receive_buffers(struct smbd_connection *info)
1428{
1429 struct smbd_response *response;
1430
1431 while ((response = get_receive_buffer(info)))
1432 mempool_free(response, info->response_mempool);
1433
1434 while ((response = get_empty_queue_buffer(info)))
1435 mempool_free(response, info->response_mempool);
1436}
1437
1438/*
1439 * Check and send an immediate or keep alive packet
1440 * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
1441 * Connection.KeepaliveRequested and Connection.SendImmediate
1442 * The idea is to extend credits to server as soon as it becomes available
1443 */
1444static void send_immediate_work(struct work_struct *work)
1445{
1446 struct smbd_connection *info = container_of(
1447 work, struct smbd_connection,
1448 send_immediate_work.work);
1449
1450 if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
1451 info->send_immediate) {
1452 log_keep_alive(INFO, "send an empty message\n");
1453 smbd_post_send_empty(info);
1454 }
1455}
1456
1457/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1458static void idle_connection_timer(struct work_struct *work)
1459{
1460 struct smbd_connection *info = container_of(
1461 work, struct smbd_connection,
1462 idle_timer_work.work);
1463
1464 if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
1465 log_keep_alive(ERR,
1466 "error status info->keep_alive_requested=%d\n",
1467 info->keep_alive_requested);
1468 smbd_disconnect_rdma_connection(info);
1469 return;
1470 }
1471
1472 log_keep_alive(INFO, "about to send an empty idle message\n");
1473 smbd_post_send_empty(info);
1474
1475 /* Setup the next idle timeout work */
1476 queue_delayed_work(info->workqueue, &info->idle_timer_work,
1477 info->keep_alive_interval*HZ);
1478}
1479
8ef130f9
LL
1480/* Destroy this SMBD connection, called from upper layer */
1481void smbd_destroy(struct smbd_connection *info)
1482{
1483 log_rdma_event(INFO, "destroying rdma session\n");
1484
1485 /* Kick off the disconnection process */
1486 smbd_disconnect_rdma_connection(info);
1487
1488 log_rdma_event(INFO, "wait for transport being destroyed\n");
1489 wait_event(info->wait_destroy,
1490 info->transport_status == SMBD_DESTROYED);
1491
1492 destroy_workqueue(info->workqueue);
1493 kfree(info);
1494}
1495
ad57b8e1
LL
1496/*
1497 * Reconnect this SMBD connection, called from upper layer
1498 * return value: 0 on success, or actual error code
1499 */
1500int smbd_reconnect(struct TCP_Server_Info *server)
1501{
1502 log_rdma_event(INFO, "reconnecting rdma session\n");
1503
1504 if (!server->smbd_conn) {
48f238a7
LL
1505 log_rdma_event(INFO, "rdma session already destroyed\n");
1506 goto create_conn;
ad57b8e1
LL
1507 }
1508
1509 /*
1510 * This is possible if transport is disconnected and we haven't received
1511 * notification from RDMA, but upper layer has detected timeout
1512 */
1513 if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
1514 log_rdma_event(INFO, "disconnecting transport\n");
1515 smbd_disconnect_rdma_connection(server->smbd_conn);
1516 }
1517
1518 /* wait until the transport is destroyed */
48f238a7
LL
1519 if (!wait_event_timeout(server->smbd_conn->wait_destroy,
1520 server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ))
1521 return -EAGAIN;
ad57b8e1
LL
1522
1523 destroy_workqueue(server->smbd_conn->workqueue);
1524 kfree(server->smbd_conn);
1525
48f238a7 1526create_conn:
ad57b8e1
LL
1527 log_rdma_event(INFO, "creating rdma session\n");
1528 server->smbd_conn = smbd_get_connection(
1529 server, (struct sockaddr *) &server->dstaddr);
48f238a7
LL
1530 log_rdma_event(INFO, "created rdma session info=%p\n",
1531 server->smbd_conn);
ad57b8e1
LL
1532
1533 return server->smbd_conn ? 0 : -ENOENT;
1534}
1535
f198186a
LL
1536static void destroy_caches_and_workqueue(struct smbd_connection *info)
1537{
1538 destroy_receive_buffers(info);
1539 destroy_workqueue(info->workqueue);
1540 mempool_destroy(info->response_mempool);
1541 kmem_cache_destroy(info->response_cache);
1542 mempool_destroy(info->request_mempool);
1543 kmem_cache_destroy(info->request_cache);
1544}
1545
1546#define MAX_NAME_LEN 80
1547static int allocate_caches_and_workqueue(struct smbd_connection *info)
1548{
1549 char name[MAX_NAME_LEN];
1550 int rc;
1551
1552 snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
1553 info->request_cache =
1554 kmem_cache_create(
1555 name,
1556 sizeof(struct smbd_request) +
1557 sizeof(struct smbd_data_transfer),
1558 0, SLAB_HWCACHE_ALIGN, NULL);
1559 if (!info->request_cache)
1560 return -ENOMEM;
1561
1562 info->request_mempool =
1563 mempool_create(info->send_credit_target, mempool_alloc_slab,
1564 mempool_free_slab, info->request_cache);
1565 if (!info->request_mempool)
1566 goto out1;
1567
1568 snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
1569 info->response_cache =
1570 kmem_cache_create(
1571 name,
1572 sizeof(struct smbd_response) +
1573 info->max_receive_size,
1574 0, SLAB_HWCACHE_ALIGN, NULL);
1575 if (!info->response_cache)
1576 goto out2;
1577
1578 info->response_mempool =
1579 mempool_create(info->receive_credit_max, mempool_alloc_slab,
1580 mempool_free_slab, info->response_cache);
1581 if (!info->response_mempool)
1582 goto out3;
1583
1584 snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
1585 info->workqueue = create_workqueue(name);
1586 if (!info->workqueue)
1587 goto out4;
1588
1589 rc = allocate_receive_buffers(info, info->receive_credit_max);
1590 if (rc) {
1591 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1592 goto out5;
1593 }
1594
1595 return 0;
1596
1597out5:
1598 destroy_workqueue(info->workqueue);
1599out4:
1600 mempool_destroy(info->response_mempool);
1601out3:
1602 kmem_cache_destroy(info->response_cache);
1603out2:
1604 mempool_destroy(info->request_mempool);
1605out1:
1606 kmem_cache_destroy(info->request_cache);
1607 return -ENOMEM;
1608}
1609
1610/* Create a SMBD connection, called by upper layer */
9084432c 1611static struct smbd_connection *_smbd_get_connection(
f198186a
LL
1612 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1613{
1614 int rc;
1615 struct smbd_connection *info;
1616 struct rdma_conn_param conn_param;
1617 struct ib_qp_init_attr qp_attr;
1618 struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
c7398583
LL
1619 struct ib_port_immutable port_immutable;
1620 u32 ird_ord_hdr[2];
f198186a
LL
1621
1622 info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1623 if (!info)
1624 return NULL;
1625
1626 info->transport_status = SMBD_CONNECTING;
1627 rc = smbd_ia_open(info, dstaddr, port);
1628 if (rc) {
1629 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1630 goto create_id_failed;
1631 }
1632
1633 if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
1634 smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
1635 log_rdma_event(ERR,
1636 "consider lowering send_credit_target = %d. "
1637 "Possible CQE overrun, device "
1638 "reporting max_cpe %d max_qp_wr %d\n",
1639 smbd_send_credit_target,
1640 info->id->device->attrs.max_cqe,
1641 info->id->device->attrs.max_qp_wr);
1642 goto config_failed;
1643 }
1644
1645 if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
1646 smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
1647 log_rdma_event(ERR,
1648 "consider lowering receive_credit_max = %d. "
1649 "Possible CQE overrun, device "
1650 "reporting max_cpe %d max_qp_wr %d\n",
1651 smbd_receive_credit_max,
1652 info->id->device->attrs.max_cqe,
1653 info->id->device->attrs.max_qp_wr);
1654 goto config_failed;
1655 }
1656
1657 info->receive_credit_max = smbd_receive_credit_max;
1658 info->send_credit_target = smbd_send_credit_target;
1659 info->max_send_size = smbd_max_send_size;
1660 info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1661 info->max_receive_size = smbd_max_receive_size;
1662 info->keep_alive_interval = smbd_keep_alive_interval;
1663
1664 if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) {
1665 log_rdma_event(ERR, "warning: device max_sge = %d too small\n",
1666 info->id->device->attrs.max_sge);
1667 log_rdma_event(ERR, "Queue Pair creation may fail\n");
1668 }
1669
1670 info->send_cq = NULL;
1671 info->recv_cq = NULL;
1672 info->send_cq = ib_alloc_cq(info->id->device, info,
1673 info->send_credit_target, 0, IB_POLL_SOFTIRQ);
1674 if (IS_ERR(info->send_cq)) {
1675 info->send_cq = NULL;
1676 goto alloc_cq_failed;
1677 }
1678
1679 info->recv_cq = ib_alloc_cq(info->id->device, info,
1680 info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
1681 if (IS_ERR(info->recv_cq)) {
1682 info->recv_cq = NULL;
1683 goto alloc_cq_failed;
1684 }
1685
1686 memset(&qp_attr, 0, sizeof(qp_attr));
1687 qp_attr.event_handler = smbd_qp_async_error_upcall;
1688 qp_attr.qp_context = info;
1689 qp_attr.cap.max_send_wr = info->send_credit_target;
1690 qp_attr.cap.max_recv_wr = info->receive_credit_max;
1691 qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
1692 qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
1693 qp_attr.cap.max_inline_data = 0;
1694 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1695 qp_attr.qp_type = IB_QPT_RC;
1696 qp_attr.send_cq = info->send_cq;
1697 qp_attr.recv_cq = info->recv_cq;
1698 qp_attr.port_num = ~0;
1699
1700 rc = rdma_create_qp(info->id, info->pd, &qp_attr);
1701 if (rc) {
1702 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1703 goto create_qp_failed;
1704 }
1705
1706 memset(&conn_param, 0, sizeof(conn_param));
1707 conn_param.initiator_depth = 0;
1708
c7398583
LL
1709 conn_param.responder_resources =
1710 info->id->device->attrs.max_qp_rd_atom
1711 < SMBD_CM_RESPONDER_RESOURCES ?
1712 info->id->device->attrs.max_qp_rd_atom :
1713 SMBD_CM_RESPONDER_RESOURCES;
1714 info->responder_resources = conn_param.responder_resources;
1715 log_rdma_mr(INFO, "responder_resources=%d\n",
1716 info->responder_resources);
1717
1718 /* Need to send IRD/ORD in private data for iWARP */
1719 info->id->device->get_port_immutable(
1720 info->id->device, info->id->port_num, &port_immutable);
1721 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1722 ird_ord_hdr[0] = info->responder_resources;
1723 ird_ord_hdr[1] = 1;
1724 conn_param.private_data = ird_ord_hdr;
1725 conn_param.private_data_len = sizeof(ird_ord_hdr);
1726 } else {
1727 conn_param.private_data = NULL;
1728 conn_param.private_data_len = 0;
1729 }
1730
f198186a
LL
1731 conn_param.retry_count = SMBD_CM_RETRY;
1732 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1733 conn_param.flow_control = 0;
1734 init_waitqueue_head(&info->wait_destroy);
1735
1736 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1737 &addr_in->sin_addr, port);
1738
1739 init_waitqueue_head(&info->conn_wait);
1740 rc = rdma_connect(info->id, &conn_param);
1741 if (rc) {
1742 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1743 goto rdma_connect_failed;
1744 }
1745
1746 wait_event_interruptible(
1747 info->conn_wait, info->transport_status != SMBD_CONNECTING);
1748
1749 if (info->transport_status != SMBD_CONNECTED) {
1750 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1751 goto rdma_connect_failed;
1752 }
1753
1754 log_rdma_event(INFO, "rdma_connect connected\n");
1755
1756 rc = allocate_caches_and_workqueue(info);
1757 if (rc) {
1758 log_rdma_event(ERR, "cache allocation failed\n");
1759 goto allocate_cache_failed;
1760 }
1761
1762 init_waitqueue_head(&info->wait_send_queue);
1763 init_waitqueue_head(&info->wait_reassembly_queue);
1764
1765 INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
1766 INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
1767 queue_delayed_work(info->workqueue, &info->idle_timer_work,
1768 info->keep_alive_interval*HZ);
1769
d649e1bb
LL
1770 init_waitqueue_head(&info->wait_smbd_send_pending);
1771 info->smbd_send_pending = 0;
1772
f64b78fd
LL
1773 init_waitqueue_head(&info->wait_smbd_recv_pending);
1774 info->smbd_recv_pending = 0;
1775
f198186a
LL
1776 init_waitqueue_head(&info->wait_send_pending);
1777 atomic_set(&info->send_pending, 0);
1778
1779 init_waitqueue_head(&info->wait_send_payload_pending);
1780 atomic_set(&info->send_payload_pending, 0);
1781
1782 INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
1783 INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
1784 INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
1785 INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
1786 info->new_credits_offered = 0;
1787 spin_lock_init(&info->lock_new_credits_offered);
1788
1789 rc = smbd_negotiate(info);
1790 if (rc) {
1791 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1792 goto negotiation_failed;
1793 }
1794
c7398583
LL
1795 rc = allocate_mr_list(info);
1796 if (rc) {
1797 log_rdma_mr(ERR, "memory registration allocation failed\n");
1798 goto allocate_mr_failed;
1799 }
1800
f198186a
LL
1801 return info;
1802
c7398583
LL
1803allocate_mr_failed:
1804 /* At this point, need to a full transport shutdown */
1805 smbd_destroy(info);
1806 return NULL;
1807
f198186a
LL
1808negotiation_failed:
1809 cancel_delayed_work_sync(&info->idle_timer_work);
1810 destroy_caches_and_workqueue(info);
1811 info->transport_status = SMBD_NEGOTIATE_FAILED;
1812 init_waitqueue_head(&info->conn_wait);
1813 rdma_disconnect(info->id);
1814 wait_event(info->conn_wait,
1815 info->transport_status == SMBD_DISCONNECTED);
1816
1817allocate_cache_failed:
1818rdma_connect_failed:
1819 rdma_destroy_qp(info->id);
1820
1821create_qp_failed:
1822alloc_cq_failed:
1823 if (info->send_cq)
1824 ib_free_cq(info->send_cq);
1825 if (info->recv_cq)
1826 ib_free_cq(info->recv_cq);
1827
1828config_failed:
1829 ib_dealloc_pd(info->pd);
1830 rdma_destroy_id(info->id);
1831
1832create_id_failed:
1833 kfree(info);
1834 return NULL;
1835}
399f9539
LL
1836
1837struct smbd_connection *smbd_get_connection(
1838 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
1839{
1840 struct smbd_connection *ret;
1841 int port = SMBD_PORT;
1842
1843try_again:
1844 ret = _smbd_get_connection(server, dstaddr, port);
1845
1846 /* Try SMB_PORT if SMBD_PORT doesn't work */
1847 if (!ret && port == SMBD_PORT) {
1848 port = SMB_PORT;
1849 goto try_again;
1850 }
1851 return ret;
1852}
f64b78fd
LL
1853
1854/*
1855 * Receive data from receive reassembly queue
1856 * All the incoming data packets are placed in reassembly queue
1857 * buf: the buffer to read data into
1858 * size: the length of data to read
1859 * return value: actual data read
1860 * Note: this implementation copies the data from reassebmly queue to receive
1861 * buffers used by upper layer. This is not the optimal code path. A better way
1862 * to do it is to not have upper layer allocate its receive buffers but rather
1863 * borrow the buffer from reassembly queue, and return it after data is
1864 * consumed. But this will require more changes to upper layer code, and also
1865 * need to consider packet boundaries while they still being reassembled.
1866 */
2026b06e
SF
1867static int smbd_recv_buf(struct smbd_connection *info, char *buf,
1868 unsigned int size)
f64b78fd
LL
1869{
1870 struct smbd_response *response;
1871 struct smbd_data_transfer *data_transfer;
1872 int to_copy, to_read, data_read, offset;
1873 u32 data_length, remaining_data_length, data_offset;
1874 int rc;
f64b78fd
LL
1875
1876again:
1877 if (info->transport_status != SMBD_CONNECTED) {
1878 log_read(ERR, "disconnected\n");
1879 return -ENODEV;
1880 }
1881
1882 /*
1883 * No need to hold the reassembly queue lock all the time as we are
1884 * the only one reading from the front of the queue. The transport
1885 * may add more entries to the back of the queue at the same time
1886 */
1887 log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
1888 info->reassembly_data_length);
1889 if (info->reassembly_data_length >= size) {
1890 int queue_length;
1891 int queue_removed = 0;
1892
1893 /*
1894 * Need to make sure reassembly_data_length is read before
1895 * reading reassembly_queue_length and calling
1896 * _get_first_reassembly. This call is lock free
1897 * as we never read at the end of the queue which are being
1898 * updated in SOFTIRQ as more data is received
1899 */
1900 virt_rmb();
1901 queue_length = info->reassembly_queue_length;
1902 data_read = 0;
1903 to_read = size;
1904 offset = info->first_entry_offset;
1905 while (data_read < size) {
1906 response = _get_first_reassembly(info);
1907 data_transfer = smbd_response_payload(response);
1908 data_length = le32_to_cpu(data_transfer->data_length);
1909 remaining_data_length =
1910 le32_to_cpu(
1911 data_transfer->remaining_data_length);
1912 data_offset = le32_to_cpu(data_transfer->data_offset);
1913
1914 /*
1915 * The upper layer expects RFC1002 length at the
1916 * beginning of the payload. Return it to indicate
1917 * the total length of the packet. This minimize the
1918 * change to upper layer packet processing logic. This
1919 * will be eventually remove when an intermediate
1920 * transport layer is added
1921 */
1922 if (response->first_segment && size == 4) {
1923 unsigned int rfc1002_len =
1924 data_length + remaining_data_length;
1925 *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
1926 data_read = 4;
1927 response->first_segment = false;
1928 log_read(INFO, "returning rfc1002 length %d\n",
1929 rfc1002_len);
1930 goto read_rfc1002_done;
1931 }
1932
1933 to_copy = min_t(int, data_length - offset, to_read);
1934 memcpy(
1935 buf + data_read,
1936 (char *)data_transfer + data_offset + offset,
1937 to_copy);
1938
1939 /* move on to the next buffer? */
1940 if (to_copy == data_length - offset) {
1941 queue_length--;
1942 /*
1943 * No need to lock if we are not at the
1944 * end of the queue
1945 */
f9de151b
SF
1946 if (queue_length)
1947 list_del(&response->list);
1948 else {
e36c048a
AB
1949 spin_lock_irq(
1950 &info->reassembly_queue_lock);
f9de151b 1951 list_del(&response->list);
e36c048a
AB
1952 spin_unlock_irq(
1953 &info->reassembly_queue_lock);
f9de151b
SF
1954 }
1955 queue_removed++;
f64b78fd
LL
1956 info->count_reassembly_queue--;
1957 info->count_dequeue_reassembly_queue++;
1958 put_receive_buffer(info, response);
1959 offset = 0;
1960 log_read(INFO, "put_receive_buffer offset=0\n");
1961 } else
1962 offset += to_copy;
1963
1964 to_read -= to_copy;
1965 data_read += to_copy;
1966
1967 log_read(INFO, "_get_first_reassembly memcpy %d bytes "
1968 "data_transfer_length-offset=%d after that "
1969 "to_read=%d data_read=%d offset=%d\n",
1970 to_copy, data_length - offset,
1971 to_read, data_read, offset);
1972 }
1973
e36c048a 1974 spin_lock_irq(&info->reassembly_queue_lock);
f64b78fd
LL
1975 info->reassembly_data_length -= data_read;
1976 info->reassembly_queue_length -= queue_removed;
e36c048a 1977 spin_unlock_irq(&info->reassembly_queue_lock);
f64b78fd
LL
1978
1979 info->first_entry_offset = offset;
1980 log_read(INFO, "returning to thread data_read=%d "
1981 "reassembly_data_length=%d first_entry_offset=%d\n",
1982 data_read, info->reassembly_data_length,
1983 info->first_entry_offset);
1984read_rfc1002_done:
1985 return data_read;
1986 }
1987
1988 log_read(INFO, "wait_event on more data\n");
1989 rc = wait_event_interruptible(
1990 info->wait_reassembly_queue,
1991 info->reassembly_data_length >= size ||
1992 info->transport_status != SMBD_CONNECTED);
1993 /* Don't return any data if interrupted */
1994 if (rc)
1995 return -ENODEV;
1996
1997 goto again;
1998}
1999
2000/*
2001 * Receive a page from receive reassembly queue
2002 * page: the page to read data into
2003 * to_read: the length of data to read
2004 * return value: actual data read
2005 */
2026b06e 2006static int smbd_recv_page(struct smbd_connection *info,
6509f50c
LL
2007 struct page *page, unsigned int page_offset,
2008 unsigned int to_read)
f64b78fd
LL
2009{
2010 int ret;
2011 char *to_address;
6509f50c 2012 void *page_address;
f64b78fd
LL
2013
2014 /* make sure we have the page ready for read */
2015 ret = wait_event_interruptible(
2016 info->wait_reassembly_queue,
2017 info->reassembly_data_length >= to_read ||
2018 info->transport_status != SMBD_CONNECTED);
2019 if (ret)
6509f50c 2020 return ret;
f64b78fd
LL
2021
2022 /* now we can read from reassembly queue and not sleep */
6509f50c
LL
2023 page_address = kmap_atomic(page);
2024 to_address = (char *) page_address + page_offset;
f64b78fd
LL
2025
2026 log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
2027 page, to_address, to_read);
2028
2029 ret = smbd_recv_buf(info, to_address, to_read);
6509f50c 2030 kunmap_atomic(page_address);
f64b78fd
LL
2031
2032 return ret;
2033}
2034
2035/*
2036 * Receive data from transport
2037 * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC
2038 * return: total bytes read, or 0. SMB Direct will not do partial read.
2039 */
2040int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2041{
2042 char *buf;
2043 struct page *page;
6509f50c 2044 unsigned int to_read, page_offset;
f64b78fd
LL
2045 int rc;
2046
2047 info->smbd_recv_pending++;
2048
2049 switch (msg->msg_iter.type) {
2050 case READ | ITER_KVEC:
2051 buf = msg->msg_iter.kvec->iov_base;
2052 to_read = msg->msg_iter.kvec->iov_len;
2053 rc = smbd_recv_buf(info, buf, to_read);
2054 break;
2055
2056 case READ | ITER_BVEC:
2057 page = msg->msg_iter.bvec->bv_page;
6509f50c 2058 page_offset = msg->msg_iter.bvec->bv_offset;
f64b78fd 2059 to_read = msg->msg_iter.bvec->bv_len;
6509f50c 2060 rc = smbd_recv_page(info, page, page_offset, to_read);
f64b78fd
LL
2061 break;
2062
2063 default:
2064 /* It's a bug in upper layer to get there */
2065 cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
2066 msg->msg_iter.type);
6509f50c 2067 rc = -EINVAL;
f64b78fd
LL
2068 }
2069
2070 info->smbd_recv_pending--;
2071 wake_up(&info->wait_smbd_recv_pending);
2072
2073 /* SMBDirect will read it all or nothing */
2074 if (rc > 0)
2075 msg->msg_iter.count = 0;
2076 return rc;
2077}
d649e1bb
LL
2078
2079/*
2080 * Send data to transport
2081 * Each rqst is transported as a SMBDirect payload
2082 * rqst: the data to write
2083 * return value: 0 if successfully write, otherwise error code
2084 */
2085int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
2086{
2087 struct kvec vec;
2088 int nvecs;
2089 int size;
b6903bcf 2090 unsigned int buflen = 0, remaining_data_length;
d649e1bb
LL
2091 int start, i, j;
2092 int max_iov_size =
2093 info->max_send_size - sizeof(struct smbd_data_transfer);
8bcda1d2 2094 struct kvec *iov;
d649e1bb
LL
2095 int rc;
2096
2097 info->smbd_send_pending++;
2098 if (info->transport_status != SMBD_CONNECTED) {
2099 rc = -ENODEV;
2100 goto done;
2101 }
2102
2103 /*
8bcda1d2
LL
2104 * Skip the RFC1002 length defined in MS-SMB2 section 2.1
2105 * It is used only for TCP transport in the iov[0]
d649e1bb
LL
2106 * In future we may want to add a transport layer under protocol
2107 * layer so this will only be issued to TCP transport
2108 */
8bcda1d2
LL
2109
2110 if (rqst->rq_iov[0].iov_len != 4) {
2111 log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len);
2112 return -EINVAL;
2113 }
2114 iov = &rqst->rq_iov[1];
d649e1bb
LL
2115
2116 /* total up iov array first */
8bcda1d2 2117 for (i = 0; i < rqst->rq_nvec-1; i++) {
d649e1bb
LL
2118 buflen += iov[i].iov_len;
2119 }
2120
b6903bcf
LL
2121 /*
2122 * Add in the page array if there is one. The caller needs to set
2123 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
2124 * ends at page boundary
2125 */
d649e1bb 2126 if (rqst->rq_npages) {
b6903bcf
LL
2127 if (rqst->rq_npages == 1)
2128 buflen += rqst->rq_tailsz;
2129 else
2130 buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) -
2131 rqst->rq_offset + rqst->rq_tailsz;
d649e1bb
LL
2132 }
2133
2134 if (buflen + sizeof(struct smbd_data_transfer) >
2135 info->max_fragmented_send_size) {
2136 log_write(ERR, "payload size %d > max size %d\n",
2137 buflen, info->max_fragmented_send_size);
2138 rc = -EINVAL;
2139 goto done;
2140 }
2141
ff30b89e
LL
2142 cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
2143 for (i = 0; i < rqst->rq_nvec-1; i++)
2144 dump_smb(iov[i].iov_base, iov[i].iov_len);
2145
d649e1bb
LL
2146 remaining_data_length = buflen;
2147
2148 log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
2149 "rq_tailsz=%d buflen=%d\n",
2150 rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
2151 rqst->rq_tailsz, buflen);
2152
2153 start = i = iov[0].iov_len ? 0 : 1;
2154 buflen = 0;
2155 while (true) {
2156 buflen += iov[i].iov_len;
2157 if (buflen > max_iov_size) {
2158 if (i > start) {
2159 remaining_data_length -=
2160 (buflen-iov[i].iov_len);
2161 log_write(INFO, "sending iov[] from start=%d "
2162 "i=%d nvecs=%d "
2163 "remaining_data_length=%d\n",
2164 start, i, i-start,
2165 remaining_data_length);
2166 rc = smbd_post_send_data(
2167 info, &iov[start], i-start,
2168 remaining_data_length);
2169 if (rc)
2170 goto done;
2171 } else {
2172 /* iov[start] is too big, break it */
2173 nvecs = (buflen+max_iov_size-1)/max_iov_size;
2174 log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
2175 " break to %d vectors\n",
2176 start, iov[start].iov_base,
2177 buflen, nvecs);
2178 for (j = 0; j < nvecs; j++) {
2179 vec.iov_base =
2180 (char *)iov[start].iov_base +
2181 j*max_iov_size;
2182 vec.iov_len = max_iov_size;
2183 if (j == nvecs-1)
2184 vec.iov_len =
2185 buflen -
2186 max_iov_size*(nvecs-1);
2187 remaining_data_length -= vec.iov_len;
2188 log_write(INFO,
2189 "sending vec j=%d iov_base=%p"
2190 " iov_len=%zu "
2191 "remaining_data_length=%d\n",
2192 j, vec.iov_base, vec.iov_len,
2193 remaining_data_length);
2194 rc = smbd_post_send_data(
2195 info, &vec, 1,
2196 remaining_data_length);
2197 if (rc)
2198 goto done;
2199 }
2200 i++;
8bcda1d2 2201 if (i == rqst->rq_nvec-1)
ab60ee7b 2202 break;
d649e1bb
LL
2203 }
2204 start = i;
2205 buflen = 0;
2206 } else {
2207 i++;
8bcda1d2 2208 if (i == rqst->rq_nvec-1) {
d649e1bb
LL
2209 /* send out all remaining vecs */
2210 remaining_data_length -= buflen;
2211 log_write(INFO,
2212 "sending iov[] from start=%d i=%d "
2213 "nvecs=%d remaining_data_length=%d\n",
2214 start, i, i-start,
2215 remaining_data_length);
2216 rc = smbd_post_send_data(info, &iov[start],
2217 i-start, remaining_data_length);
2218 if (rc)
2219 goto done;
2220 break;
2221 }
2222 }
2223 log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
2224 }
2225
2226 /* now sending pages if there are any */
2227 for (i = 0; i < rqst->rq_npages; i++) {
b6903bcf
LL
2228 unsigned int offset;
2229
2230 rqst_page_get_length(rqst, i, &buflen, &offset);
d649e1bb
LL
2231 nvecs = (buflen + max_iov_size - 1) / max_iov_size;
2232 log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
2233 buflen, nvecs);
2234 for (j = 0; j < nvecs; j++) {
2235 size = max_iov_size;
2236 if (j == nvecs-1)
2237 size = buflen - j*max_iov_size;
2238 remaining_data_length -= size;
2239 log_write(INFO, "sending pages i=%d offset=%d size=%d"
2240 " remaining_data_length=%d\n",
b6903bcf
LL
2241 i, j*max_iov_size+offset, size,
2242 remaining_data_length);
d649e1bb 2243 rc = smbd_post_send_page(
b6903bcf
LL
2244 info, rqst->rq_pages[i],
2245 j*max_iov_size + offset,
d649e1bb
LL
2246 size, remaining_data_length);
2247 if (rc)
2248 goto done;
2249 }
2250 }
2251
2252done:
2253 /*
2254 * As an optimization, we don't wait for individual I/O to finish
2255 * before sending the next one.
2256 * Send them all and wait for pending send count to get to 0
2257 * that means all the I/Os have been out and we are good to return
2258 */
2259
2260 wait_event(info->wait_send_payload_pending,
2261 atomic_read(&info->send_payload_pending) == 0);
2262
2263 info->smbd_send_pending--;
2264 wake_up(&info->wait_smbd_send_pending);
2265
2266 return rc;
2267}
c7398583
LL
2268
2269static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2270{
2271 struct smbd_mr *mr;
2272 struct ib_cqe *cqe;
2273
2274 if (wc->status) {
2275 log_rdma_mr(ERR, "status=%d\n", wc->status);
2276 cqe = wc->wr_cqe;
2277 mr = container_of(cqe, struct smbd_mr, cqe);
2278 smbd_disconnect_rdma_connection(mr->conn);
2279 }
2280}
2281
2282/*
2283 * The work queue function that recovers MRs
2284 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2285 * again. Both calls are slow, so finish them in a workqueue. This will not
2286 * block I/O path.
2287 * There is one workqueue that recovers MRs, there is no need to lock as the
2288 * I/O requests calling smbd_register_mr will never update the links in the
2289 * mr_list.
2290 */
2291static void smbd_mr_recovery_work(struct work_struct *work)
2292{
2293 struct smbd_connection *info =
2294 container_of(work, struct smbd_connection, mr_recovery_work);
2295 struct smbd_mr *smbdirect_mr;
2296 int rc;
2297
2298 list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
2299 if (smbdirect_mr->state == MR_INVALIDATED ||
2300 smbdirect_mr->state == MR_ERROR) {
2301
2302 if (smbdirect_mr->state == MR_INVALIDATED) {
2303 ib_dma_unmap_sg(
2304 info->id->device, smbdirect_mr->sgl,
2305 smbdirect_mr->sgl_count,
2306 smbdirect_mr->dir);
2307 smbdirect_mr->state = MR_READY;
2308 } else if (smbdirect_mr->state == MR_ERROR) {
2309
2310 /* recover this MR entry */
2311 rc = ib_dereg_mr(smbdirect_mr->mr);
2312 if (rc) {
2313 log_rdma_mr(ERR,
ac65cb62 2314 "ib_dereg_mr failed rc=%x\n",
c7398583
LL
2315 rc);
2316 smbd_disconnect_rdma_connection(info);
2317 }
2318
2319 smbdirect_mr->mr = ib_alloc_mr(
2320 info->pd, info->mr_type,
2321 info->max_frmr_depth);
2322 if (IS_ERR(smbdirect_mr->mr)) {
2323 log_rdma_mr(ERR,
2324 "ib_alloc_mr failed mr_type=%x "
2325 "max_frmr_depth=%x\n",
2326 info->mr_type,
2327 info->max_frmr_depth);
2328 smbd_disconnect_rdma_connection(info);
2329 }
2330
2331 smbdirect_mr->state = MR_READY;
2332 }
2333 /* smbdirect_mr->state is updated by this function
2334 * and is read and updated by I/O issuing CPUs trying
2335 * to get a MR, the call to atomic_inc_return
2336 * implicates a memory barrier and guarantees this
2337 * value is updated before waking up any calls to
2338 * get_mr() from the I/O issuing CPUs
2339 */
2340 if (atomic_inc_return(&info->mr_ready_count) == 1)
2341 wake_up_interruptible(&info->wait_mr);
2342 }
2343 }
2344}
2345
2346static void destroy_mr_list(struct smbd_connection *info)
2347{
2348 struct smbd_mr *mr, *tmp;
2349
2350 cancel_work_sync(&info->mr_recovery_work);
2351 list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
2352 if (mr->state == MR_INVALIDATED)
2353 ib_dma_unmap_sg(info->id->device, mr->sgl,
2354 mr->sgl_count, mr->dir);
2355 ib_dereg_mr(mr->mr);
2356 kfree(mr->sgl);
2357 kfree(mr);
2358 }
2359}
2360
2361/*
2362 * Allocate MRs used for RDMA read/write
2363 * The number of MRs will not exceed hardware capability in responder_resources
2364 * All MRs are kept in mr_list. The MR can be recovered after it's used
2365 * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
2366 * as MRs are used and recovered for I/O, but the list links will not change
2367 */
2368static int allocate_mr_list(struct smbd_connection *info)
2369{
2370 int i;
2371 struct smbd_mr *smbdirect_mr, *tmp;
2372
2373 INIT_LIST_HEAD(&info->mr_list);
2374 init_waitqueue_head(&info->wait_mr);
2375 spin_lock_init(&info->mr_list_lock);
2376 atomic_set(&info->mr_ready_count, 0);
2377 atomic_set(&info->mr_used_count, 0);
2378 init_waitqueue_head(&info->wait_for_mr_cleanup);
2379 /* Allocate more MRs (2x) than hardware responder_resources */
2380 for (i = 0; i < info->responder_resources * 2; i++) {
2381 smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
2382 if (!smbdirect_mr)
2383 goto out;
2384 smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
2385 info->max_frmr_depth);
2386 if (IS_ERR(smbdirect_mr->mr)) {
2387 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
2388 "max_frmr_depth=%x\n",
2389 info->mr_type, info->max_frmr_depth);
2390 goto out;
2391 }
2392 smbdirect_mr->sgl = kcalloc(
2393 info->max_frmr_depth,
2394 sizeof(struct scatterlist),
2395 GFP_KERNEL);
2396 if (!smbdirect_mr->sgl) {
2397 log_rdma_mr(ERR, "failed to allocate sgl\n");
2398 ib_dereg_mr(smbdirect_mr->mr);
2399 goto out;
2400 }
2401 smbdirect_mr->state = MR_READY;
2402 smbdirect_mr->conn = info;
2403
2404 list_add_tail(&smbdirect_mr->list, &info->mr_list);
2405 atomic_inc(&info->mr_ready_count);
2406 }
2407 INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
2408 return 0;
2409
2410out:
2411 kfree(smbdirect_mr);
2412
2413 list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
2414 ib_dereg_mr(smbdirect_mr->mr);
2415 kfree(smbdirect_mr->sgl);
2416 kfree(smbdirect_mr);
2417 }
2418 return -ENOMEM;
2419}
2420
2421/*
2422 * Get a MR from mr_list. This function waits until there is at least one
2423 * MR available in the list. It may access the list while the
2424 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2425 * as they never modify the same places. However, there may be several CPUs
2426 * issueing I/O trying to get MR at the same time, mr_list_lock is used to
2427 * protect this situation.
2428 */
2429static struct smbd_mr *get_mr(struct smbd_connection *info)
2430{
2431 struct smbd_mr *ret;
2432 int rc;
2433again:
2434 rc = wait_event_interruptible(info->wait_mr,
2435 atomic_read(&info->mr_ready_count) ||
2436 info->transport_status != SMBD_CONNECTED);
2437 if (rc) {
2438 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2439 return NULL;
2440 }
2441
2442 if (info->transport_status != SMBD_CONNECTED) {
2443 log_rdma_mr(ERR, "info->transport_status=%x\n",
2444 info->transport_status);
2445 return NULL;
2446 }
2447
2448 spin_lock(&info->mr_list_lock);
2449 list_for_each_entry(ret, &info->mr_list, list) {
2450 if (ret->state == MR_READY) {
2451 ret->state = MR_REGISTERED;
2452 spin_unlock(&info->mr_list_lock);
2453 atomic_dec(&info->mr_ready_count);
2454 atomic_inc(&info->mr_used_count);
2455 return ret;
2456 }
2457 }
2458
2459 spin_unlock(&info->mr_list_lock);
2460 /*
2461 * It is possible that we could fail to get MR because other processes may
2462 * try to acquire a MR at the same time. If this is the case, retry it.
2463 */
2464 goto again;
2465}
2466
2467/*
2468 * Register memory for RDMA read/write
2469 * pages[]: the list of pages to register memory with
2470 * num_pages: the number of pages to register
2471 * tailsz: if non-zero, the bytes to register in the last page
2472 * writing: true if this is a RDMA write (SMB read), false for RDMA read
2473 * need_invalidate: true if this MR needs to be locally invalidated after I/O
2474 * return value: the MR registered, NULL if failed.
2475 */
2476struct smbd_mr *smbd_register_mr(
2477 struct smbd_connection *info, struct page *pages[], int num_pages,
2478 int tailsz, bool writing, bool need_invalidate)
2479{
2480 struct smbd_mr *smbdirect_mr;
2481 int rc, i;
2482 enum dma_data_direction dir;
2483 struct ib_reg_wr *reg_wr;
2484 struct ib_send_wr *bad_wr;
2485
2486 if (num_pages > info->max_frmr_depth) {
2487 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2488 num_pages, info->max_frmr_depth);
2489 return NULL;
2490 }
2491
2492 smbdirect_mr = get_mr(info);
2493 if (!smbdirect_mr) {
2494 log_rdma_mr(ERR, "get_mr returning NULL\n");
2495 return NULL;
2496 }
2497 smbdirect_mr->need_invalidate = need_invalidate;
2498 smbdirect_mr->sgl_count = num_pages;
2499 sg_init_table(smbdirect_mr->sgl, num_pages);
2500
2501 for (i = 0; i < num_pages - 1; i++)
2502 sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
2503
2504 sg_set_page(&smbdirect_mr->sgl[i], pages[i],
2505 tailsz ? tailsz : PAGE_SIZE, 0);
2506
2507 dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2508 smbdirect_mr->dir = dir;
2509 rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
2510 if (!rc) {
2511 log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2512 num_pages, dir, rc);
2513 goto dma_map_error;
2514 }
2515
2516 rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
2517 NULL, PAGE_SIZE);
2518 if (rc != num_pages) {
2519 log_rdma_mr(INFO,
2520 "ib_map_mr_sg failed rc = %x num_pages = %x\n",
2521 rc, num_pages);
2522 goto map_mr_error;
2523 }
2524
2525 ib_update_fast_reg_key(smbdirect_mr->mr,
2526 ib_inc_rkey(smbdirect_mr->mr->rkey));
2527 reg_wr = &smbdirect_mr->wr;
2528 reg_wr->wr.opcode = IB_WR_REG_MR;
2529 smbdirect_mr->cqe.done = register_mr_done;
2530 reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
2531 reg_wr->wr.num_sge = 0;
2532 reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2533 reg_wr->mr = smbdirect_mr->mr;
2534 reg_wr->key = smbdirect_mr->mr->rkey;
2535 reg_wr->access = writing ?
2536 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2537 IB_ACCESS_REMOTE_READ;
2538
2539 /*
2540 * There is no need for waiting for complemtion on ib_post_send
2541 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2542 * on the next ib_post_send when we actaully send I/O to remote peer
2543 */
2544 rc = ib_post_send(info->id->qp, &reg_wr->wr, &bad_wr);
2545 if (!rc)
2546 return smbdirect_mr;
2547
2548 log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2549 rc, reg_wr->key);
2550
2551 /* If all failed, attempt to recover this MR by setting it MR_ERROR*/
2552map_mr_error:
2553 ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
2554 smbdirect_mr->sgl_count, smbdirect_mr->dir);
2555
2556dma_map_error:
2557 smbdirect_mr->state = MR_ERROR;
2558 if (atomic_dec_and_test(&info->mr_used_count))
2559 wake_up(&info->wait_for_mr_cleanup);
2560
21a4e14a
LL
2561 smbd_disconnect_rdma_connection(info);
2562
c7398583
LL
2563 return NULL;
2564}
2565
2566static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2567{
2568 struct smbd_mr *smbdirect_mr;
2569 struct ib_cqe *cqe;
2570
2571 cqe = wc->wr_cqe;
2572 smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
2573 smbdirect_mr->state = MR_INVALIDATED;
2574 if (wc->status != IB_WC_SUCCESS) {
2575 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2576 smbdirect_mr->state = MR_ERROR;
2577 }
2578 complete(&smbdirect_mr->invalidate_done);
2579}
2580
2581/*
2582 * Deregister a MR after I/O is done
2583 * This function may wait if remote invalidation is not used
2584 * and we have to locally invalidate the buffer to prevent data is being
2585 * modified by remote peer after upper layer consumes it
2586 */
2587int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
2588{
2589 struct ib_send_wr *wr, *bad_wr;
2590 struct smbd_connection *info = smbdirect_mr->conn;
2591 int rc = 0;
2592
2593 if (smbdirect_mr->need_invalidate) {
2594 /* Need to finish local invalidation before returning */
2595 wr = &smbdirect_mr->inv_wr;
2596 wr->opcode = IB_WR_LOCAL_INV;
2597 smbdirect_mr->cqe.done = local_inv_done;
2598 wr->wr_cqe = &smbdirect_mr->cqe;
2599 wr->num_sge = 0;
2600 wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
2601 wr->send_flags = IB_SEND_SIGNALED;
2602
2603 init_completion(&smbdirect_mr->invalidate_done);
2604 rc = ib_post_send(info->id->qp, wr, &bad_wr);
2605 if (rc) {
2606 log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2607 smbd_disconnect_rdma_connection(info);
2608 goto done;
2609 }
2610 wait_for_completion(&smbdirect_mr->invalidate_done);
2611 smbdirect_mr->need_invalidate = false;
2612 } else
2613 /*
2614 * For remote invalidation, just set it to MR_INVALIDATED
2615 * and defer to mr_recovery_work to recover the MR for next use
2616 */
2617 smbdirect_mr->state = MR_INVALIDATED;
2618
2619 /*
2620 * Schedule the work to do MR recovery for future I/Os
2621 * MR recovery is slow and we don't want it to block the current I/O
2622 */
2623 queue_work(info->workqueue, &info->mr_recovery_work);
2624
2625done:
2626 if (atomic_dec_and_test(&info->mr_used_count))
2627 wake_up(&info->wait_for_mr_cleanup);
2628
2629 return rc;
2630}