// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/*          Fredy Neeser */
/*          Greg Joyce <greg@opengridcomputing.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
/* Copyright (c) 2017, Open Grid Computing, Inc. */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/inet.h>
#include <linux/tcp.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>

#include "siw.h"
#include "siw_cm.h"

/*
 * Set to any combination of
 * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
 */
static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
static const bool relaxed_ird_negotiation = true;

static void siw_cm_llp_state_change(struct sock *s);
static void siw_cm_llp_data_ready(struct sock *s);
static void siw_cm_llp_write_space(struct sock *s);
static void siw_cm_llp_error_report(struct sock *s);
static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
			 int status);

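/*
 * Install the CM's upcalls on a connection socket. While connection
 * setup and teardown is in the hands of the CM, TCP socket events are
 * routed to the siw_cm_llp_* handlers below instead of the socket's
 * original upcalls.
 */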
static void siw_sk_assign_cm_upcalls(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_state_change = siw_cm_llp_state_change;
	sk->sk_data_ready = siw_cm_llp_data_ready;
	sk->sk_write_space = siw_cm_llp_write_space;
	sk->sk_error_report = siw_cm_llp_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}

static void siw_sk_save_upcalls(struct sock *sk)
{
	struct siw_cep *cep = sk_to_cep(sk);

	write_lock_bh(&sk->sk_callback_lock);
	cep->sk_state_change = sk->sk_state_change;
	cep->sk_data_ready = sk->sk_data_ready;
	cep->sk_write_space = sk->sk_write_space;
	cep->sk_error_report = sk->sk_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}

static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
{
	sk->sk_state_change = cep->sk_state_change;
	sk->sk_data_ready = cep->sk_data_ready;
	sk->sk_write_space = cep->sk_write_space;
	sk->sk_error_report = cep->sk_error_report;
	sk->sk_user_data = NULL;
}

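/*
 * Hand the socket over to the QP: once MPA processing is done, data
 * ready and write space events get serviced by the QP's fastpath
 * receive and transmit handlers, no longer by the CM.
 */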
static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
{
	struct socket *s = cep->sock;
	struct sock *sk = s->sk;

	write_lock_bh(&sk->sk_callback_lock);

	qp->attrs.sk = s;
	sk->sk_data_ready = siw_qp_llp_data_ready;
	sk->sk_write_space = siw_qp_llp_write_space;

	write_unlock_bh(&sk->sk_callback_lock);
}

static void siw_socket_disassoc(struct socket *s)
{
	struct sock *sk = s->sk;
	struct siw_cep *cep;

	if (sk) {
		write_lock_bh(&sk->sk_callback_lock);
		cep = sk_to_cep(sk);
		if (cep) {
			siw_sk_restore_upcalls(sk, cep);
			siw_cep_put(cep);
		} else {
			pr_warn("siw: cannot restore sk callbacks: no ep\n");
		}
		write_unlock_bh(&sk->sk_callback_lock);
	} else {
		pr_warn("siw: cannot restore sk callbacks: no sk\n");
	}
}

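/*
 * Temporary data ready upcall for an accepting side which still waits
 * for the peer's zero length RTR frame (a zero length Write or Read,
 * see siw_qp_mpa_rts()). The frame is fed into the receive path via
 * tcp_read_sock(); if it was consumed without error, connection
 * establishment is signalled and the socket gets associated with
 * the QP.
 */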
static void siw_rtr_data_ready(struct sock *sk)
{
	struct siw_cep *cep;
	struct siw_qp *qp = NULL;
	read_descriptor_t rd_desc;

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (!cep) {
		WARN(1, "No connection endpoint\n");
		goto out;
	}
	qp = sk_to_qp(sk);

	memset(&rd_desc, 0, sizeof(rd_desc));
	rd_desc.arg.data = qp;
	rd_desc.count = 1;

	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
	/*
	 * Check if first frame was successfully processed.
	 * Signal connection full establishment if yes.
	 * Failed data processing would have already scheduled
	 * connection drop.
	 */
	if (!qp->rx_stream.rx_suspend)
		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
out:
	read_unlock(&sk->sk_callback_lock);
	if (qp)
		siw_qp_socket_assoc(cep, qp);
}

static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
{
	struct sock *sk = cep->sock->sk;

	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_data_ready = siw_rtr_data_ready;
	sk->sk_write_space = siw_qp_llp_write_space;
	write_unlock_bh(&sk->sk_callback_lock);
}

static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
{
	cep->sock = s;
	siw_cep_get(cep);
	s->sk->sk_user_data = cep;

	siw_sk_save_upcalls(s->sk);
	siw_sk_assign_cm_upcalls(s->sk);
}

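/*
 * Allocate and initialize a new connection endpoint (CEP) in IDLE
 * state and add it to the device's list of endpoints.
 * Returns NULL if out of memory.
 */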
static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
{
	struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
	unsigned long flags;

	if (!cep)
		return NULL;

	INIT_LIST_HEAD(&cep->listenq);
	INIT_LIST_HEAD(&cep->devq);
	INIT_LIST_HEAD(&cep->work_freelist);

	kref_init(&cep->ref);
	cep->state = SIW_EPSTATE_IDLE;
	init_waitqueue_head(&cep->waitq);
	spin_lock_init(&cep->lock);
	cep->sdev = sdev;
	cep->enhanced_rdma_conn_est = false;

	spin_lock_irqsave(&sdev->lock, flags);
	list_add_tail(&cep->devq, &sdev->cep_list);
	spin_unlock_irqrestore(&sdev->lock, flags);

	siw_dbg_cep(cep, "new endpoint\n");
	return cep;
}

static void siw_cm_free_work(struct siw_cep *cep)
{
	struct list_head *w, *tmp;
	struct siw_cm_work *work;

	list_for_each_safe(w, tmp, &cep->work_freelist) {
		work = list_entry(w, struct siw_cm_work, list);
		list_del(&work->list);
		kfree(work);
	}
}

static void siw_cancel_mpatimer(struct siw_cep *cep)
{
	spin_lock_bh(&cep->lock);
	if (cep->mpa_timer) {
		if (cancel_delayed_work(&cep->mpa_timer->work)) {
			siw_cep_put(cep);
			kfree(cep->mpa_timer); /* not needed again */
		}
		cep->mpa_timer = NULL;
	}
	spin_unlock_bh(&cep->lock);
}

static void siw_put_work(struct siw_cm_work *work)
{
	INIT_LIST_HEAD(&work->list);
	spin_lock_bh(&work->cep->lock);
	list_add(&work->list, &work->cep->work_freelist);
	spin_unlock_bh(&work->cep->lock);
}

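/*
 * A CEP gets manipulated by only one context at a time:
 * siw_cep_set_inuse() claims the CEP, sleeping until a concurrent
 * holder calls siw_cep_set_free(). Pending signals are flushed to
 * keep the interruptible wait retrying.
 */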
static void siw_cep_set_inuse(struct siw_cep *cep)
{
	unsigned long flags;
retry:
	spin_lock_irqsave(&cep->lock, flags);

	if (cep->in_use) {
		spin_unlock_irqrestore(&cep->lock, flags);
		wait_event_interruptible(cep->waitq, !cep->in_use);
		if (signal_pending(current))
			flush_signals(current);
		goto retry;
	} else {
		cep->in_use = 1;
		spin_unlock_irqrestore(&cep->lock, flags);
	}
}

static void siw_cep_set_free(struct siw_cep *cep)
{
	unsigned long flags;

	spin_lock_irqsave(&cep->lock, flags);
	cep->in_use = 0;
	spin_unlock_irqrestore(&cep->lock, flags);

	wake_up(&cep->waitq);
}

static void __siw_cep_dealloc(struct kref *ref)
{
	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
	struct siw_device *sdev = cep->sdev;
	unsigned long flags;

	WARN_ON(cep->listen_cep);

	/* kfree(NULL) is safe */
	kfree(cep->mpa.pdata);
	spin_lock_bh(&cep->lock);
	if (!list_empty(&cep->work_freelist))
		siw_cm_free_work(cep);
	spin_unlock_bh(&cep->lock);

	spin_lock_irqsave(&sdev->lock, flags);
	list_del(&cep->devq);
	spin_unlock_irqrestore(&sdev->lock, flags);

	siw_dbg_cep(cep, "free endpoint\n");
	kfree(cep);
}

static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
{
	struct siw_cm_work *work = NULL;

	spin_lock_bh(&cep->lock);
	if (!list_empty(&cep->work_freelist)) {
		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
				  list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&cep->lock);
	return work;
}

static int siw_cm_alloc_work(struct siw_cep *cep, int num)
{
	struct siw_cm_work *work;

	while (num--) {
		work = kmalloc(sizeof(*work), GFP_KERNEL);
		if (!work) {
			if (!list_empty(&cep->work_freelist))
				siw_cm_free_work(cep);
			return -ENOMEM;
		}
		work->cep = cep;
		INIT_LIST_HEAD(&work->list);
		list_add(&work->list, &cep->work_freelist);
	}
	return 0;
}

/*
 * siw_cm_upcall()
 *
 * Upcall to IWCM to inform about async connection events
 */
static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
			 int status)
{
	struct iw_cm_event event;
	struct iw_cm_id *id;

	memset(&event, 0, sizeof(event));
	event.status = status;
	event.event = reason;

	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
		event.provider_data = cep;
		id = cep->listen_cep->cm_id;
	} else {
		id = cep->cm_id;
	}
	/* Signal IRD and ORD */
	if (reason == IW_CM_EVENT_ESTABLISHED ||
	    reason == IW_CM_EVENT_CONNECT_REPLY) {
		/* Signal negotiated IRD/ORD values we will use */
		event.ird = cep->ird;
		event.ord = cep->ord;
	} else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
		event.ird = cep->ord;
		event.ord = cep->ird;
	}
	/* Signal private data and address information */
	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
	    reason == IW_CM_EVENT_CONNECT_REPLY) {
		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);

		if (pd_len) {
			/*
			 * hand over MPA private data
			 */
			event.private_data_len = pd_len;
			event.private_data = cep->mpa.pdata;

			/* Hide MPA V2 IRD/ORD control */
			if (cep->enhanced_rdma_conn_est) {
				event.private_data_len -=
					sizeof(struct mpa_v2_data);
				event.private_data +=
					sizeof(struct mpa_v2_data);
			}
		}
		getname_local(cep->sock, &event.local_addr);
		getname_peer(cep->sock, &event.remote_addr);
	}
	siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n",
		    cep->qp ? qp_id(cep->qp) : -1, id, reason, status);

	return id->event_handler(id, &event);
}

/*
 * siw_qp_cm_drop()
 *
 * Drops established LLP connection if present and not already
 * scheduled for dropping. Called from user context, SQ workqueue
 * or receive IRQ. Caller signals if socket can be immediately
 * closed (basically, if not in IRQ).
 */
void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
{
	struct siw_cep *cep = qp->cep;

	qp->rx_stream.rx_suspend = 1;
	qp->tx_ctx.tx_suspend = 1;

	if (!qp->cep)
		return;

	if (schedule) {
		siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
	} else {
		siw_cep_set_inuse(cep);

		if (cep->state == SIW_EPSTATE_CLOSED) {
			siw_dbg_cep(cep, "already closed\n");
			goto out;
		}
		siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);

		if (qp->term_info.valid)
			siw_send_terminate(qp);

		if (cep->cm_id) {
			switch (cep->state) {
			case SIW_EPSTATE_AWAIT_MPAREP:
				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
					      -EINVAL);
				break;

			case SIW_EPSTATE_RDMA_MODE:
				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
				break;

			case SIW_EPSTATE_IDLE:
			case SIW_EPSTATE_LISTENING:
			case SIW_EPSTATE_CONNECTING:
			case SIW_EPSTATE_AWAIT_MPAREQ:
			case SIW_EPSTATE_RECVD_MPAREQ:
			case SIW_EPSTATE_CLOSED:
			default:
				break;
			}
			cep->cm_id->rem_ref(cep->cm_id);
			cep->cm_id = NULL;
			siw_cep_put(cep);
		}
		cep->state = SIW_EPSTATE_CLOSED;

		if (cep->sock) {
			siw_socket_disassoc(cep->sock);
			/*
			 * Immediately close socket
			 */
			sock_release(cep->sock);
			cep->sock = NULL;
		}
		if (cep->qp) {
			cep->qp = NULL;
			siw_qp_put(qp);
		}
out:
		siw_cep_set_free(cep);
	}
}

void siw_cep_put(struct siw_cep *cep)
{
	WARN_ON(kref_read(&cep->ref) < 1);
	kref_put(&cep->ref, __siw_cep_dealloc);
}

void siw_cep_get(struct siw_cep *cep)
{
	kref_get(&cep->ref);
}

/*
 * siw_send_mpareqrep()
 *
 * Send an MPA Request or Reply, consisting of the common MPA
 * Request/Reply header, an optional MPA v2 IRD/ORD control word, and
 * optional private data. Expects pd_len in host byte order; the
 * header's pd_len field is set to network byte order here.
 */
static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
{
	struct socket *s = cep->sock;
	struct mpa_rr *rr = &cep->mpa.hdr;
	struct kvec iov[3];
	struct msghdr msg;
	int rv;
	int iovec_num = 0;
	int mpa_len;

	memset(&msg, 0, sizeof(msg));

	iov[iovec_num].iov_base = rr;
	iov[iovec_num].iov_len = sizeof(*rr);
	mpa_len = sizeof(*rr);

	if (cep->enhanced_rdma_conn_est) {
		iovec_num++;
		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
		mpa_len += sizeof(cep->mpa.v2_ctrl);
	}
	if (pd_len) {
		iovec_num++;
		iov[iovec_num].iov_base = (char *)pdata;
		iov[iovec_num].iov_len = pd_len;
		mpa_len += pd_len;
	}
	if (cep->enhanced_rdma_conn_est)
		pd_len += sizeof(cep->mpa.v2_ctrl);

	rr->params.pd_len = cpu_to_be16(pd_len);

	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);

	return rv < 0 ? rv : 0;
}

/*
 * Receive MPA Request/Reply header.
 *
 * Returns 0 if complete MPA Request/Reply header including
 * eventual private data was received. Returns -EAGAIN if
 * header was partially received or negative error code otherwise.
 *
 * Context: May be called in process context only
 */
static int siw_recv_mpa_rr(struct siw_cep *cep)
{
	struct mpa_rr *hdr = &cep->mpa.hdr;
	struct socket *s = cep->sock;
	u16 pd_len;
	int rcvd, to_rcv;

	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
				  sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
				  0);
		if (rcvd <= 0)
			return -ECONNABORTED;

		cep->mpa.bytes_rcvd += rcvd;

		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
			return -EAGAIN;

		if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
			return -EPROTO;
	}
	pd_len = be16_to_cpu(hdr->params.pd_len);

	/*
	 * At least the MPA Request/Reply header (frame not including
	 * private data) has been received.
	 * Receive (or continue receiving) any private data.
	 */
	to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));

	if (!to_rcv) {
		/*
		 * We must have hdr->params.pd_len == 0 and thus received a
		 * complete MPA Request/Reply frame.
		 * Check against peer protocol violation.
		 */
		u32 word;

		rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
		if (rcvd == -EAGAIN)
			return 0;

		if (rcvd == 0) {
			siw_dbg_cep(cep, "peer EOF\n");
			return -EPIPE;
		}
		if (rcvd < 0) {
			siw_dbg_cep(cep, "error: %d\n", rcvd);
			return rcvd;
		}
		siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);

		return -EPROTO;
	}

	/*
	 * At this point, we must have hdr->params.pd_len != 0.
	 * A private data buffer gets allocated if hdr->params.pd_len != 0.
	 */
	if (!cep->mpa.pdata) {
		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
		if (!cep->mpa.pdata)
			return -ENOMEM;
	}
	rcvd = ksock_recv(
		s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
		to_rcv + 4, MSG_DONTWAIT);

	if (rcvd < 0)
		return rcvd;

	if (rcvd > to_rcv)
		return -EPROTO;

	cep->mpa.bytes_rcvd += rcvd;

	if (to_rcv == rcvd) {
		siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
		return 0;
	}
	return -EAGAIN;
}

/*
 * siw_proc_mpareq()
 *
 * Read MPA Request from socket and signal new connection to IWCM
 * if success. Caller must hold lock on corresponding listening CEP.
 */
static int siw_proc_mpareq(struct siw_cep *cep)
{
	struct mpa_rr *req;
	int version, rv;
	u16 pd_len;

	rv = siw_recv_mpa_rr(cep);
	if (rv)
		return rv;

	req = &cep->mpa.hdr;

	version = __mpa_rr_revision(req->params.bits);
	pd_len = be16_to_cpu(req->params.pd_len);

	if (version > MPA_REVISION_2)
		/* allow for 0, 1, and 2 only */
		return -EPROTO;

	if (memcmp(req->key, MPA_KEY_REQ, 16))
		return -EPROTO;

	/* Prepare for sending MPA reply */
	memcpy(req->key, MPA_KEY_REP, 16);

	if (version == MPA_REVISION_2 &&
	    (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
		/*
		 * MPA version 2 must signal IRD/ORD values and P2P mode
		 * in private data if header flag MPA_RR_FLAG_ENHANCED
		 * is set.
		 */
		if (pd_len < sizeof(struct mpa_v2_data))
			goto reject_conn;

		cep->enhanced_rdma_conn_est = true;
	}

	/* MPA Markers: currently not supported. Marker TX to be added. */
	if (req->params.bits & MPA_RR_FLAG_MARKERS)
		goto reject_conn;

	if (req->params.bits & MPA_RR_FLAG_CRC) {
		/*
		 * RFC 5044, page 27: CRC MUST be used if peer requests it.
		 * siw specific: 'mpa_crc_strict' parameter to reject
		 * connection with CRC if local CRC off enforced by
		 * 'mpa_crc_strict' module parameter.
		 */
		if (!mpa_crc_required && mpa_crc_strict)
			goto reject_conn;

		/* Enable CRC if requested by module parameter */
		if (mpa_crc_required)
			req->params.bits |= MPA_RR_FLAG_CRC;
	}
	if (cep->enhanced_rdma_conn_est) {
		struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;

		/*
		 * Peer requested ORD becomes requested local IRD,
		 * peer requested IRD becomes requested local ORD.
		 * IRD and ORD get limited by global maximum values.
		 */
		cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
		cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
		cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
		cep->ird = min(cep->ird, SIW_MAX_IRD_QP);

		/* May get overwritten by locally negotiated values */
		cep->mpa.v2_ctrl.ird = htons(cep->ird);
		cep->mpa.v2_ctrl.ord = htons(cep->ord);

		/*
		 * Support for peer sent zero length Write or Read to
		 * let local side enter RTS. Writes are preferred.
		 * Sends would require pre-posting a Receive and are
		 * not supported.
		 * Propose zero length Write if none of Read and Write
		 * is indicated.
		 */
		if (v2->ird & MPA_V2_PEER_TO_PEER) {
			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;

			if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
			else if (v2->ord & MPA_V2_RDMA_READ_RTR)
				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
			else
				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
		}
	}

	cep->state = SIW_EPSTATE_RECVD_MPAREQ;

	/* Keep reference until IWCM accepts/rejects */
	siw_cep_get(cep);
	rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
	if (rv)
		siw_cep_put(cep);

	return rv;

reject_conn:
	siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
		    req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
		    mpa_crc_required, mpa_crc_strict,
		    req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);

	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
	req->params.bits |= MPA_RR_FLAG_REJECT;

	if (!mpa_crc_required && mpa_crc_strict)
		req->params.bits &= ~MPA_RR_FLAG_CRC;

	if (pd_len)
		kfree(cep->mpa.pdata);

	cep->mpa.pdata = NULL;

	siw_send_mpareqrep(cep, NULL, 0);

	return -EOPNOTSUPP;
}

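/*
 * siw_proc_mpareply()
 *
 * Read and check the peer's MPA Reply: negotiate IRD/ORD and P2P
 * (RTR) mode, move the QP to RTS and put socket RX/TX under QP
 * control. Called from the CM work handler with the CEP in use.
 */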
static int siw_proc_mpareply(struct siw_cep *cep)
{
	struct siw_qp_attrs qp_attrs;
	enum siw_qp_attr_mask qp_attr_mask;
	struct siw_qp *qp = cep->qp;
	struct mpa_rr *rep;
	int rv;
	u16 rep_ord;
	u16 rep_ird;
	bool ird_insufficient = false;
	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;

	rv = siw_recv_mpa_rr(cep);
	if (rv != -EAGAIN)
		siw_cancel_mpatimer(cep);
	if (rv)
		goto out_err;

	rep = &cep->mpa.hdr;

	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
		/* allow for 0, 1, and 2 only */
		rv = -EPROTO;
		goto out_err;
	}
	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
				   LLP_ECODE_INVALID_REQ_RESP, 0);
		siw_send_terminate(qp);
		rv = -EPROTO;
		goto out_err;
	}
	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
		siw_dbg_cep(cep, "got mpa reject\n");
		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);

		return -ECONNRESET;
	}
	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
		siw_dbg_cep(cep, "peer allows GSO on TX\n");
		qp->tx_ctx.gso_seg_limit = 0;
	}
	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
	    (mpa_crc_strict && !mpa_crc_required &&
	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
			    mpa_crc_required, mpa_crc_strict,
			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);

		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);

		return -EINVAL;
	}
	if (cep->enhanced_rdma_conn_est) {
		struct mpa_v2_data *v2;

		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
			/*
			 * Protocol failure: The responder MUST reply with
			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
			 */
			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
				    __mpa_rr_revision(rep->params.bits),
				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
					    1 :
					    0);

			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
				      -ECONNRESET);
			return -EINVAL;
		}
		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;

		if (cep->ird < rep_ord &&
		    (relaxed_ird_negotiation == false ||
		     rep_ord > cep->sdev->attrs.max_ird)) {
			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
				    cep->ird, rep_ord,
				    cep->sdev->attrs.max_ord);
			ird_insufficient = true;
		}
		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
				    rep_ird);
			ird_insufficient = true;
		}
		/*
		 * Always report negotiated peer values to user,
		 * even if IRD/ORD negotiation failed
		 */
		cep->ird = rep_ord;
		cep->ord = rep_ird;

		if (ird_insufficient) {
			/*
			 * If the initiator IRD is insufficient for the
			 * responder ORD, send a TERM.
			 */
			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
					   LLP_ETYPE_MPA,
					   LLP_ECODE_INSUFFICIENT_IRD, 0);
			siw_send_terminate(qp);
			rv = -ENOMEM;
			goto out_err;
		}
		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
			mpa_p2p_mode =
				cep->mpa.v2_ctrl_req.ord &
				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);

		/*
		 * Check if we requested P2P mode, and if peer agrees
		 */
		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
			if ((mpa_p2p_mode & v2->ord) == 0) {
				/*
				 * We requested RTR mode(s), but the peer
				 * did not pick any mode we support.
				 */
				siw_dbg_cep(cep,
					    "rtr mode: req %2x, got %2x\n",
					    mpa_p2p_mode,
					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
						       MPA_V2_RDMA_READ_RTR));

				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
						   LLP_ETYPE_MPA,
						   LLP_ECODE_NO_MATCHING_RTR,
						   0);
				siw_send_terminate(qp);
				rv = -EPROTO;
				goto out_err;
			}
			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
						  MPA_V2_RDMA_READ_RTR);
		}
	}
	memset(&qp_attrs, 0, sizeof(qp_attrs));

	if (rep->params.bits & MPA_RR_FLAG_CRC)
		qp_attrs.flags = SIW_MPA_CRC;

	qp_attrs.irq_size = cep->ird;
	qp_attrs.orq_size = cep->ord;
	qp_attrs.sk = cep->sock;
	qp_attrs.state = SIW_QP_STATE_RTS;

	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;

	/* Move socket RX/TX under QP control */
	down_write(&qp->state_lock);
	if (qp->attrs.state > SIW_QP_STATE_RTR) {
		rv = -EINVAL;
		up_write(&qp->state_lock);
		goto out_err;
	}
	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);

	siw_qp_socket_assoc(cep, qp);

	up_write(&qp->state_lock);

	/* Send extra RDMA frame to trigger peer RTS if negotiated */
	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
		if (rv)
			goto out_err;
	}
	if (!rv) {
		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
		if (!rv)
			cep->state = SIW_EPSTATE_RDMA_MODE;

		return 0;
	}

out_err:
	siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);

	return rv;
}

/*
 * siw_accept_newconn - accept an incoming pending connection
 *
 */
static void siw_accept_newconn(struct siw_cep *cep)
{
	struct socket *s = cep->sock;
	struct socket *new_s = NULL;
	struct siw_cep *new_cep = NULL;
	int rv = 0; /* debug only. should disappear */

	if (cep->state != SIW_EPSTATE_LISTENING)
		goto error;

	new_cep = siw_cep_alloc(cep->sdev);
	if (!new_cep)
		goto error;

	/*
	 * 4: Allocate a sufficient number of work elements
	 * to allow concurrent handling of local + peer close
	 * events, MPA header processing + MPA timeout.
	 */
	if (siw_cm_alloc_work(new_cep, 4) != 0)
		goto error;

	/*
	 * Copy saved socket callbacks from listening CEP
	 * and assign new socket with new CEP
	 */
	new_cep->sk_state_change = cep->sk_state_change;
	new_cep->sk_data_ready = cep->sk_data_ready;
	new_cep->sk_write_space = cep->sk_write_space;
	new_cep->sk_error_report = cep->sk_error_report;

	rv = kernel_accept(s, &new_s, O_NONBLOCK);
	if (rv != 0) {
		/*
		 * Connection already aborted by peer..?
		 */
		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
		goto error;
	}
	new_cep->sock = new_s;
	siw_cep_get(new_cep);
	new_s->sk->sk_user_data = new_cep;

	siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s);

	if (siw_tcp_nagle == false) {
		int val = 1;

		rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY,
				       (char *)&val, sizeof(val));
		if (rv) {
			siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv);
			goto error;
		}
	}
	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;

	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
	if (rv)
		goto error;
	/*
	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
	 */
	new_cep->listen_cep = cep;
	siw_cep_get(cep);

	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
		/*
		 * MPA REQ already queued
		 */
		siw_dbg_cep(cep, "immediate mpa request\n");

		siw_cep_set_inuse(new_cep);
		rv = siw_proc_mpareq(new_cep);
		siw_cep_set_free(new_cep);

		if (rv != -EAGAIN) {
			siw_cep_put(cep);
			new_cep->listen_cep = NULL;
			if (rv)
				goto error;
		}
	}
	return;

error:
	if (new_cep)
		siw_cep_put(new_cep);

	if (new_s) {
		siw_socket_disassoc(new_s);
		sock_release(new_s);
		new_cep->sock = NULL;
	}
	siw_dbg_cep(cep, "error %d\n", rv);
}

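/*
 * siw_cm_work_handler()
 *
 * Central CM work dispatcher: runs deferred connection accept, MPA
 * header processing, LLP and peer close handling and MPA timeout
 * processing, and finally tears the CEP down if it reached the end
 * of its life.
 */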
static void siw_cm_work_handler(struct work_struct *w)
{
	struct siw_cm_work *work;
	struct siw_cep *cep;
	int release_cep = 0, rv = 0;

	work = container_of(w, struct siw_cm_work, work.work);
	cep = work->cep;

	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
		    cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state);

	siw_cep_set_inuse(cep);

	switch (work->type) {
	case SIW_CM_WORK_ACCEPT:
		siw_accept_newconn(cep);
		break;

	case SIW_CM_WORK_READ_MPAHDR:
		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
			if (cep->listen_cep) {
				siw_cep_set_inuse(cep->listen_cep);

				if (cep->listen_cep->state ==
				    SIW_EPSTATE_LISTENING)
					rv = siw_proc_mpareq(cep);
				else
					rv = -EFAULT;

				siw_cep_set_free(cep->listen_cep);

				if (rv != -EAGAIN) {
					siw_cep_put(cep->listen_cep);
					cep->listen_cep = NULL;
					if (rv)
						siw_cep_put(cep);
				}
			}
		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
			rv = siw_proc_mpareply(cep);
		} else {
			/*
			 * CEP already moved out of MPA handshake.
			 * any connection management already done.
			 * silently ignore the mpa packet.
			 */
			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
				cep->sock->sk->sk_data_ready(cep->sock->sk);
				siw_dbg_cep(cep, "already in RDMA mode\n");
			} else {
				siw_dbg_cep(cep, "out of state: %d\n",
					    cep->state);
			}
		}
		if (rv && rv != -EAGAIN)
			release_cep = 1;
		break;

	case SIW_CM_WORK_CLOSE_LLP:
		/*
		 * QP scheduled LLP close
		 */
		if (cep->qp && cep->qp->term_info.valid)
			siw_send_terminate(cep->qp);

		if (cep->cm_id)
			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);

		release_cep = 1;
		break;

	case SIW_CM_WORK_PEER_CLOSE:
		if (cep->cm_id) {
			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
				/*
				 * MPA reply not received, but connection drop
				 */
				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
					      -ECONNRESET);
			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
				/*
				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
				 * to transition IWCM into CLOSING.
				 */
				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
			}
			/*
			 * for other states there is no connection
			 * known to the IWCM.
			 */
		} else {
			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
				/*
				 * Wait for the ulp/CM to call accept/reject
				 */
				siw_dbg_cep(cep,
					    "mpa req recvd, wait for ULP\n");
			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
				/*
				 * Socket close before MPA request received.
				 */
				siw_dbg_cep(cep, "no mpareq: drop listener\n");
				siw_cep_put(cep->listen_cep);
				cep->listen_cep = NULL;
			}
		}
		release_cep = 1;
		break;

	case SIW_CM_WORK_MPATIMEOUT:
		cep->mpa_timer = NULL;

		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
			/*
			 * MPA request timed out:
			 * Hide any partially received private data and signal
			 * timeout
			 */
			cep->mpa.hdr.params.pd_len = 0;

			if (cep->cm_id)
				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
					      -ETIMEDOUT);
			release_cep = 1;

		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
			/*
			 * No MPA request received after peer TCP stream setup.
			 */
			if (cep->listen_cep) {
				siw_cep_put(cep->listen_cep);
				cep->listen_cep = NULL;
			}
			release_cep = 1;
		}
		break;

	default:
		WARN(1, "Undefined CM work type: %d\n", work->type);
	}
	if (release_cep) {
		siw_dbg_cep(cep,
			    "release: timer=%s, QP[%u], id 0x%p\n",
			    cep->mpa_timer ? "y" : "n",
			    cep->qp ? qp_id(cep->qp) : -1, cep->cm_id);

		siw_cancel_mpatimer(cep);

		cep->state = SIW_EPSTATE_CLOSED;

		if (cep->qp) {
			struct siw_qp *qp = cep->qp;
			/*
			 * Serialize a potential race with application
			 * closing the QP and calling siw_qp_cm_drop()
			 */
			siw_qp_get(qp);
			siw_cep_set_free(cep);

			siw_qp_llp_close(qp);
			siw_qp_put(qp);

			siw_cep_set_inuse(cep);
			cep->qp = NULL;
			siw_qp_put(qp);
		}
		if (cep->sock) {
			siw_socket_disassoc(cep->sock);
			sock_release(cep->sock);
			cep->sock = NULL;
		}
		if (cep->cm_id) {
			cep->cm_id->rem_ref(cep->cm_id);
			cep->cm_id = NULL;
			siw_cep_put(cep);
		}
	}
	siw_cep_set_free(cep);
	siw_put_work(work);
	siw_cep_put(cep);
}

static struct workqueue_struct *siw_cm_wq;

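/*
 * siw_cm_queue_work()
 *
 * Take a pre-allocated work element from the CEP's free list and
 * schedule it on the CM workqueue. MPA timeout work is armed with a
 * delay and remembered in cep->mpa_timer, so it can be cancelled once
 * the awaited MPA frame arrives.
 */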
int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
{
	struct siw_cm_work *work = siw_get_work(cep);
	unsigned long delay = 0;

	if (!work) {
		siw_dbg_cep(cep, "failed with no work available\n");
		return -ENOMEM;
	}
	work->type = type;
	work->cep = cep;

	siw_cep_get(cep);

	INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);

	if (type == SIW_CM_WORK_MPATIMEOUT) {
		cep->mpa_timer = work;

		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
			delay = MPAREQ_TIMEOUT;
		else
			delay = MPAREP_TIMEOUT;
	}
	siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n",
		    cep->qp ? qp_id(cep->qp) : -1, type, work, delay);

	queue_delayed_work(siw_cm_wq, &work->work, delay);

	return 0;
}

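/*
 * Data ready upcall during connection setup: schedule MPA
 * Request/Reply processing while an MPA exchange is in progress;
 * data in any other CEP state only gets a debug notice.
 */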
static void siw_cm_llp_data_ready(struct sock *sk)
{
	struct siw_cep *cep;

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (!cep) {
		WARN_ON(1);
		goto out;
	}
	siw_dbg_cep(cep, "state: %d\n", cep->state);

	switch (cep->state) {
	case SIW_EPSTATE_RDMA_MODE:
		/* fall through */
	case SIW_EPSTATE_LISTENING:
		break;

	case SIW_EPSTATE_AWAIT_MPAREQ:
		/* fall through */
	case SIW_EPSTATE_AWAIT_MPAREP:
		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
		break;

	default:
		siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
		break;
	}
out:
	read_unlock(&sk->sk_callback_lock);
}

static void siw_cm_llp_write_space(struct sock *sk)
{
	struct siw_cep *cep = sk_to_cep(sk);

	if (cep)
		siw_dbg_cep(cep, "state: %d\n", cep->state);
}

static void siw_cm_llp_error_report(struct sock *sk)
{
	struct siw_cep *cep = sk_to_cep(sk);

	if (cep) {
		siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
			    sk->sk_err, sk->sk_state, cep->state);
		cep->sk_error_report(sk);
	}
}

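/*
 * TCP state change upcall: ESTABLISHED on a listening endpoint
 * signals a new connection to be accepted, CLOSE/CLOSE_WAIT a peer
 * initiated close. Actual work is deferred to the CM workqueue;
 * the socket's original state change upcall runs afterwards.
 */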
static void siw_cm_llp_state_change(struct sock *sk)
{
	struct siw_cep *cep;
	void (*orig_state_change)(struct sock *s);

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (!cep) {
		/* endpoint already disassociated */
		read_unlock(&sk->sk_callback_lock);
		return;
	}
	orig_state_change = cep->sk_state_change;

	siw_dbg_cep(cep, "state: %d\n", cep->state);

	switch (sk->sk_state) {
	case TCP_ESTABLISHED:
		/*
		 * handle accepting socket as special case where only
		 * new connection is possible
		 */
		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
		break;

	case TCP_CLOSE:
	case TCP_CLOSE_WAIT:
		if (cep->qp)
			cep->qp->tx_ctx.tx_suspend = 1;
		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
		break;

	default:
		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
	}
	read_unlock(&sk->sk_callback_lock);
	orig_state_change(sk);
}

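/*
 * Bind socket to local address and connect it to the remote address,
 * with SO_REUSEADDR set to make the local address available again
 * as soon as possible after a close.
 */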
static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
			      struct sockaddr *raddr)
{
	int rv, flags = 0, s_val = 1;
	size_t size = laddr->sa_family == AF_INET ?
		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);

	/*
	 * Make address available again asap.
	 */
	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
			       sizeof(s_val));
	if (rv < 0)
		return rv;

	rv = s->ops->bind(s, laddr, size);
	if (rv < 0)
		return rv;

	rv = s->ops->connect(s, raddr, size, flags);

	return rv < 0 ? rv : 0;
}

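/*
 * siw_connect()
 *
 * Active side connection request: create and connect a TCP socket,
 * associate it with a fresh CEP, the user's QP and CM id, send the
 * MPA Request and arm the MPA timeout. The peer's MPA Reply is later
 * handled by siw_proc_mpareply() out of the CM work handler.
 */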
int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{
	struct siw_device *sdev = to_siw_dev(id->device);
	struct siw_qp *qp;
	struct siw_cep *cep = NULL;
	struct socket *s = NULL;
	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
			*raddr = (struct sockaddr *)&id->remote_addr;
	bool p2p_mode = peer_to_peer, v4 = true;
	u16 pd_len = params->private_data_len;
	int version = mpa_version, rv;

	if (pd_len > MPA_MAX_PRIVDATA)
		return -EINVAL;

	if (params->ird > sdev->attrs.max_ird ||
	    params->ord > sdev->attrs.max_ord)
		return -ENOMEM;

	if (laddr->sa_family == AF_INET6)
		v4 = false;
	else if (laddr->sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Respect any iwarp port mapping: Use mapped remote address
	 * if valid. Local address must not be mapped, since siw
	 * uses kernel TCP stack.
	 */
	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
	    to_sockaddr_in6(id->remote_addr).sin6_port != 0)
		raddr = (struct sockaddr *)&id->m_remote_addr;

	qp = siw_qp_id2obj(sdev, params->qpn);
	if (!qp) {
		WARN(1, "[QP %u] does not exist\n", params->qpn);
		return -EINVAL;
	}
	if (v4)
		siw_dbg_qp(qp,
			   "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n",
			   id, pd_len,
			   &((struct sockaddr_in *)(laddr))->sin_addr,
			   ntohs(((struct sockaddr_in *)(laddr))->sin_port),
			   &((struct sockaddr_in *)(raddr))->sin_addr,
			   ntohs(((struct sockaddr_in *)(raddr))->sin_port));
	else
		siw_dbg_qp(qp,
			   "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n",
			   id, pd_len,
			   &((struct sockaddr_in6 *)(laddr))->sin6_addr,
			   ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port),
			   &((struct sockaddr_in6 *)(raddr))->sin6_addr,
			   ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port));

	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
	if (rv < 0)
		goto error;

	/*
	 * NOTE: For simplification, connect() is called in blocking
	 * mode. Might be reconsidered for async connection setup at
	 * TCP level.
	 */
	rv = kernel_bindconnect(s, laddr, raddr);
	if (rv != 0) {
		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
		goto error;
	}
	if (siw_tcp_nagle == false) {
		int val = 1;

		rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val,
				       sizeof(val));
		if (rv) {
			siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv);
			goto error;
		}
	}
	cep = siw_cep_alloc(sdev);
	if (!cep) {
		rv = -ENOMEM;
		goto error;
	}
	siw_cep_set_inuse(cep);

	/* Associate QP with CEP */
	siw_cep_get(cep);
	qp->cep = cep;

	/* siw_qp_get(qp) already done by QP lookup */
	cep->qp = qp;

	id->add_ref(id);
	cep->cm_id = id;

	/*
	 * 4: Allocate a sufficient number of work elements
	 * to allow concurrent handling of local + peer close
	 * events, MPA header processing + MPA timeout.
	 */
	rv = siw_cm_alloc_work(cep, 4);
	if (rv != 0) {
		rv = -ENOMEM;
		goto error;
	}
	cep->ird = params->ird;
	cep->ord = params->ord;

	if (p2p_mode && cep->ord == 0)
		cep->ord = 1;

	cep->state = SIW_EPSTATE_CONNECTING;

	/*
	 * Associate CEP with socket
	 */
	siw_cep_socket_assoc(cep, s);

	cep->state = SIW_EPSTATE_AWAIT_MPAREP;

	/*
	 * Set MPA Request bits: CRC if required, no MPA Markers,
	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
	 */
	cep->mpa.hdr.params.bits = 0;
	if (version > MPA_REVISION_2) {
		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
		version = MPA_REVISION_2;
		/* Adjust also module parameter */
		mpa_version = MPA_REVISION_2;
	}
	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);

	if (try_gso)
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;

	if (mpa_crc_required)
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;

	/*
	 * If MPA version == 2:
	 * o Include ORD and IRD.
	 * o Indicate peer-to-peer mode, if required by module
	 *   parameter 'peer_to_peer'.
	 */
	if (version == MPA_REVISION_2) {
		cep->enhanced_rdma_conn_est = true;
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;

		cep->mpa.v2_ctrl.ird = htons(cep->ird);
		cep->mpa.v2_ctrl.ord = htons(cep->ord);

		if (p2p_mode) {
			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
			cep->mpa.v2_ctrl.ord |= rtr_type;
		}
		/* Remember own P2P mode requested */
		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
	}
	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);

	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
	/*
	 * Reset private data.
	 */
	cep->mpa.hdr.params.pd_len = 0;

	if (rv >= 0) {
		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
		if (!rv) {
			siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id,
				    qp_id(qp));
			siw_cep_set_free(cep);
			return 0;
		}
	}
error:
	siw_dbg_qp(qp, "failed: %d\n", rv);

	if (cep) {
		siw_socket_disassoc(s);
		sock_release(s);
		cep->sock = NULL;

		cep->qp = NULL;

		cep->cm_id = NULL;
		id->rem_ref(id);
		siw_cep_put(cep);

		qp->cep = NULL;
		siw_cep_put(cep);

		cep->state = SIW_EPSTATE_CLOSED;

		siw_cep_set_free(cep);

		siw_cep_put(cep);

	} else if (s) {
		sock_release(s);
	}
	siw_qp_put(qp);

	return rv;
}

/*
 * siw_accept - Let SoftiWARP accept an RDMA connection request
 *
 * @id:		New connection management id to be used for accepted
 *		connection request
 * @params:	Connection parameters provided by ULP for accepting connection
 *
 * Transition QP to RTS state, associate new CM id @id with accepted CEP
 * and get prepared for TCP input by installing socket callbacks.
 * Then send MPA Reply and generate the "connection established" event.
 * Socket callbacks must be installed before sending MPA Reply, because
 * the latter may cause a first RDMA message to arrive from the RDMA Initiator
 * side very quickly, at which time the socket callbacks must be ready.
 */
int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{
	struct siw_device *sdev = to_siw_dev(id->device);
	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
	struct siw_qp *qp;
	struct siw_qp_attrs qp_attrs;
	int rv, max_priv_data = MPA_MAX_PRIVDATA;
	bool wait_for_peer_rts = false;

	siw_cep_set_inuse(cep);
	siw_cep_put(cep);

	/* Free lingering inbound private data */
	if (cep->mpa.hdr.params.pd_len) {
		cep->mpa.hdr.params.pd_len = 0;
		kfree(cep->mpa.pdata);
		cep->mpa.pdata = NULL;
	}
	siw_cancel_mpatimer(cep);

	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
		siw_dbg_cep(cep, "id 0x%p: out of state\n", id);

		siw_cep_set_free(cep);
		siw_cep_put(cep);

		return -ECONNRESET;
	}
	qp = siw_qp_id2obj(sdev, params->qpn);
	if (!qp) {
		WARN(1, "[QP %u] does not exist\n", params->qpn);
		siw_cep_set_free(cep);
		siw_cep_put(cep);

		return -EINVAL;
	}
	down_write(&qp->state_lock);
	if (qp->attrs.state > SIW_QP_STATE_RTR) {
		rv = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}
	siw_dbg_cep(cep, "id 0x%p\n", id);

	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
		siw_dbg_cep(cep, "peer allows GSO on TX\n");
		qp->tx_ctx.gso_seg_limit = 0;
	}
	if (params->ord > sdev->attrs.max_ord ||
	    params->ird > sdev->attrs.max_ird) {
		siw_dbg_cep(
			cep,
			"id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n",
			id, qp_id(qp), params->ord, sdev->attrs.max_ord,
			params->ird, sdev->attrs.max_ird);
		rv = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}
	if (cep->enhanced_rdma_conn_est)
		max_priv_data -= sizeof(struct mpa_v2_data);

	if (params->private_data_len > max_priv_data) {
		siw_dbg_cep(
			cep,
			"id 0x%p, [QP %u]: private data length: %d (max %d)\n",
			id, qp_id(qp), params->private_data_len, max_priv_data);
		rv = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}
	if (cep->enhanced_rdma_conn_est) {
		if (params->ord > cep->ord) {
			if (relaxed_ird_negotiation) {
				params->ord = cep->ord;
			} else {
				cep->ird = params->ird;
				cep->ord = params->ord;
				rv = -EINVAL;
				up_write(&qp->state_lock);
				goto error;
			}
		}
		if (params->ird < cep->ird) {
			if (relaxed_ird_negotiation &&
			    cep->ird <= sdev->attrs.max_ird)
				params->ird = cep->ird;
			else {
				rv = -ENOMEM;
				up_write(&qp->state_lock);
				goto error;
			}
		}
		if (cep->mpa.v2_ctrl.ord &
		    (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
			wait_for_peer_rts = true;
		/*
		 * Signal back negotiated IRD and ORD values
		 */
		cep->mpa.v2_ctrl.ord =
			htons(params->ord & MPA_IRD_ORD_MASK) |
			(cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
		cep->mpa.v2_ctrl.ird =
			htons(params->ird & MPA_IRD_ORD_MASK) |
			(cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
	}
	cep->ird = params->ird;
	cep->ord = params->ord;

	cep->cm_id = id;
	id->add_ref(id);

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.orq_size = cep->ord;
	qp_attrs.irq_size = cep->ird;
	qp_attrs.sk = cep->sock;
	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
		qp_attrs.flags = SIW_MPA_CRC;
	qp_attrs.state = SIW_QP_STATE_RTS;

	siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp));

	/* Associate QP with CEP */
	siw_cep_get(cep);
	qp->cep = cep;

	/* siw_qp_get(qp) already done by QP lookup */
	cep->qp = qp;

	cep->state = SIW_EPSTATE_RDMA_MODE;

	/* Move socket RX/TX under QP control */
	rv = siw_qp_modify(qp, &qp_attrs,
			   SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
				   SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
				   SIW_QP_ATTR_MPA);
	up_write(&qp->state_lock);

	if (rv)
		goto error;

	siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n",
		    id, qp_id(qp), params->private_data_len);

	rv = siw_send_mpareqrep(cep, params->private_data,
				params->private_data_len);
	if (rv != 0)
		goto error;

	if (wait_for_peer_rts) {
		siw_sk_assign_rtr_upcalls(cep);
	} else {
		siw_qp_socket_assoc(cep, qp);
		rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
		if (rv)
			goto error;
	}
	siw_cep_set_free(cep);

	return 0;
error:
	siw_socket_disassoc(cep->sock);
	sock_release(cep->sock);
	cep->sock = NULL;

	cep->state = SIW_EPSTATE_CLOSED;

	if (cep->cm_id) {
		cep->cm_id->rem_ref(id);
		cep->cm_id = NULL;
	}
	if (qp->cep) {
		siw_cep_put(cep);
		qp->cep = NULL;
	}
	cep->qp = NULL;
	siw_qp_put(qp);

	siw_cep_set_free(cep);
	siw_cep_put(cep);

	return rv;
}

1746 | /* | |
1747 | * siw_reject() | |
1748 | * | |
1749 | * Local connection reject case: send private data back to the peer, |
1750 | * close the connection, and drop the reference on the connection id. |
1751 | */ | |
1752 | int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) | |
1753 | { | |
1754 | struct siw_cep *cep = (struct siw_cep *)id->provider_data; | |
1755 | ||
1756 | siw_cep_set_inuse(cep); | |
1757 | siw_cep_put(cep); | |
1758 | ||
1759 | siw_cancel_mpatimer(cep); | |
1760 | ||
1761 | if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { | |
1762 | siw_dbg_cep(cep, "id 0x%p: out of state\n", id); | |
1763 | ||
1764 | siw_cep_set_free(cep); | |
1765 | siw_cep_put(cep); /* put last reference */ | |
1766 | ||
1767 | return -ECONNRESET; | |
1768 | } | |
1769 | siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state, | |
1770 | pd_len); | |
1771 | ||
1772 | if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { | |
1773 | cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ | |
1774 | siw_send_mpareqrep(cep, pdata, pd_len); | |
1775 | } | |
1776 | siw_socket_disassoc(cep->sock); | |
1777 | sock_release(cep->sock); | |
1778 | cep->sock = NULL; | |
1779 | ||
1780 | cep->state = SIW_EPSTATE_CLOSED; | |
1781 | ||
1782 | siw_cep_set_free(cep); | |
1783 | siw_cep_put(cep); | |
1784 | ||
1785 | return 0; | |
1786 | } | |
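/*
 * Editorial sketch: siw_reject() is reached through the IWCM core
 * when a consumer rejects an incoming connection request. A hedged
 * example of such a consumer-side handler (handler and helper names
 * are illustrative, not part of this driver):
 *
 *	static int my_conn_req_handler(struct iw_cm_id *cm_id,
 *				       struct iw_cm_event *event)
 *	{
 *		if (!my_can_accept(event))
 *			return iw_cm_reject(cm_id, NULL, 0);
 *		return my_accept(cm_id, event);
 *	}
 */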
1787 | ||
1788 | static int siw_listen_address(struct iw_cm_id *id, int backlog, | |
1789 | struct sockaddr *laddr, int addr_family) | |
1790 | { | |
1791 | struct socket *s; | |
1792 | struct siw_cep *cep = NULL; | |
1793 | struct siw_device *sdev = to_siw_dev(id->device); | |
1794 | int rv = 0, s_val; | |
1795 | ||
1796 | rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); | |
1797 | if (rv < 0) | |
1798 | return rv; | |
1799 | ||
1800 | /* | |
1801 | * Allow binding local port when still in TIME_WAIT from last close. | |
1802 | */ | |
1803 | s_val = 1; | |
1804 | rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, | |
1805 | sizeof(s_val)); | |
1806 | if (rv) { | |
1807 | siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv); | |
1808 | goto error; | |
1809 | } | |
1810 | rv = s->ops->bind(s, laddr, addr_family == AF_INET ? | |
1811 | sizeof(struct sockaddr_in) : | |
1812 | sizeof(struct sockaddr_in6)); | |
1813 | if (rv) { | |
1814 | siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv); | |
1815 | goto error; | |
1816 | } | |
1817 | cep = siw_cep_alloc(sdev); | |
1818 | if (!cep) { | |
1819 | rv = -ENOMEM; | |
1820 | goto error; | |
1821 | } | |
1822 | siw_cep_socket_assoc(cep, s); | |
1823 | ||
1824 | rv = siw_cm_alloc_work(cep, backlog); | |
1825 | if (rv) { | |
1826 | siw_dbg(id->device, | |
1827 | "id 0x%p: alloc_work error %d, backlog %d\n", id, | |
1828 | rv, backlog); | |
1829 | goto error; | |
1830 | } | |
1831 | rv = s->ops->listen(s, backlog); | |
1832 | if (rv) { | |
1833 | siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv); | |
1834 | goto error; | |
1835 | } | |
1836 | cep->cm_id = id; | |
1837 | id->add_ref(id); | |
1838 | ||
1839 | /* | |
1840 | * In case of a wildcard rdma_listen on a multi-homed device, | |
1841 | * a listener's IWCM id is associated with more than one listening CEP. | |
1842 | * | |
1843 | * We currently use id->provider_data in three different ways: | |
1844 | * | |
1845 | * o For a listener's IWCM id, id->provider_data points to | |
1846 | * the list_head of the list of listening CEPs. | |
1847 | * Uses: siw_create_listen(), siw_destroy_listen() | |
1848 | * | |
1849 | * o For each accepted passive-side IWCM id, id->provider_data | |
1850 | * points to the CEP itself. This is a consequence of | |
1851 | * - siw_cm_upcall() setting event.provider_data = cep and | |
1852 | * - the IWCM's cm_conn_req_handler() setting provider_data of the | |
1853 | * new passive-side IWCM id equal to event.provider_data | |
1854 | * Uses: siw_accept(), siw_reject() | |
1855 | * | |
1856 | * o For an active-side IWCM id, id->provider_data is not used at all. | |
1857 | * | |
1858 | */ | |
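/*
 * Editorial sketch (not driver code): the listening CEPs attached
 * to a wildcard listener can be visited through that list_head,
 * just as siw_drop_listeners() does below:
 *
 *	struct list_head *p;
 *
 *	list_for_each(p, (struct list_head *)id->provider_data) {
 *		struct siw_cep *cep =
 *			list_entry(p, struct siw_cep, listenq);
 *
 *		pr_info("listening cep in state %d\n", cep->state);
 *	}
 */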
1859 | if (!id->provider_data) { | |
1860 | id->provider_data = | |
1861 | kmalloc(sizeof(struct list_head), GFP_KERNEL); | |
1862 | if (!id->provider_data) { | |
1863 | rv = -ENOMEM; | |
1864 | goto error; | |
1865 | } | |
1866 | INIT_LIST_HEAD((struct list_head *)id->provider_data); | |
1867 | } | |
1868 | list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); | |
1869 | cep->state = SIW_EPSTATE_LISTENING; | |
1870 | ||
1871 | if (addr_family == AF_INET) | |
1872 | siw_dbg(id->device, "Listen at laddr %pI4 %u\n", | |
1873 | &(((struct sockaddr_in *)laddr)->sin_addr), | |
1874 | ntohs(((struct sockaddr_in *)laddr)->sin_port)); |
1875 | else | |
1876 | siw_dbg(id->device, "Listen at laddr %pI6 %u\n", | |
1877 | &(((struct sockaddr_in6 *)laddr)->sin6_addr), | |
1878 | ntohs(((struct sockaddr_in6 *)laddr)->sin6_port)); |
1879 | ||
1880 | return 0; | |
1881 | ||
1882 | error: | |
1883 | siw_dbg(id->device, "failed: %d\n", rv); | |
1884 | ||
1885 | if (cep) { | |
1886 | siw_cep_set_inuse(cep); | |
1887 | ||
1888 | if (cep->cm_id) { | |
1889 | cep->cm_id->rem_ref(cep->cm_id); | |
1890 | cep->cm_id = NULL; | |
1891 | } | |
1892 | cep->sock = NULL; | |
1893 | siw_socket_disassoc(s); | |
1894 | cep->state = SIW_EPSTATE_CLOSED; | |
1895 | ||
1896 | siw_cep_set_free(cep); | |
1897 | siw_cep_put(cep); | |
1898 | } | |
1899 | sock_release(s); | |
1900 | ||
1901 | return rv; | |
1902 | } | |
1903 | ||
1904 | static void siw_drop_listeners(struct iw_cm_id *id) | |
1905 | { | |
1906 | struct list_head *p, *tmp; | |
1907 | ||
1908 | /* | |
1909 | * In case of a wildcard rdma_listen on a multi-homed device, | |
1910 | * a listener's IWCM id is associated with more than one listening CEP. | |
1911 | */ | |
1912 | list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { | |
1913 | struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); | |
1914 | ||
1915 | list_del(p); | |
1916 | ||
1917 | siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id, | |
1918 | cep->state); | |
1919 | ||
1920 | siw_cep_set_inuse(cep); | |
1921 | ||
1922 | if (cep->cm_id) { | |
1923 | cep->cm_id->rem_ref(cep->cm_id); | |
1924 | cep->cm_id = NULL; | |
1925 | } | |
1926 | if (cep->sock) { | |
1927 | siw_socket_disassoc(cep->sock); | |
1928 | sock_release(cep->sock); | |
1929 | cep->sock = NULL; | |
1930 | } | |
1931 | cep->state = SIW_EPSTATE_CLOSED; | |
1932 | siw_cep_set_free(cep); | |
1933 | siw_cep_put(cep); | |
1934 | } | |
1935 | } | |
1936 | ||
1937 | /* | |
1938 | * siw_create_listen - Create resources for a listener's IWCM ID @id | |
1939 | * | |
1940 | * Listens at the socket address id->local_addr. |
1941 | * | |
1942 | * If the listener's @id provides a specific local IP address, at most one | |
1943 | * listening socket is created and associated with @id. | |
1944 | * | |
1945 | * If the listener's @id provides the wildcard (zero) local IP address, | |
1946 | * a separate listen is performed for each local IP address of the device | |
1947 | * by creating a listening socket and binding to that local IP address. | |
1948 | * | |
1949 | */ | |
1950 | int siw_create_listen(struct iw_cm_id *id, int backlog) | |
1951 | { | |
1952 | struct net_device *dev = to_siw_dev(id->device)->netdev; | |
1953 | int rv = 0, listeners = 0; | |
1954 | ||
1955 | siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog); | |
1956 | ||
1957 | /* | |
1958 | * For each address attached to the interface, create a |
1959 | * listening socket if id->local_addr is either the wildcard |
1960 | * IP address or matches that interface address. |
1961 | */ | |
1962 | if (id->local_addr.ss_family == AF_INET) { | |
1963 | struct in_device *in_dev = in_dev_get(dev); | |
1964 | struct sockaddr_in s_laddr, *s_raddr; | |
2a3c389a | 1965 | const struct in_ifaddr *ifa; |
6c52fdc2 BM | 1966 | |
 | /* in_dev_get() may return NULL if no IPv4 config is present */ |
 | if (!in_dev) |
 | return -ENODEV; |
 | |
1967 | memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr)); | |
1968 | s_raddr = (struct sockaddr_in *)&id->remote_addr; | |
1969 | ||
1970 | siw_dbg(id->device, | |
1971 | "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n", | |
1972 | id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), | |
1973 | &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); | |
1974 | ||
c421651f | 1975 | rtnl_lock(); |
2a3c389a | 1976 | in_dev_for_each_ifa_rtnl(ifa, in_dev) { |
6c52fdc2 BM | 1977 | if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || |
1978 | s_laddr.sin_addr.s_addr == ifa->ifa_address) { | |
1979 | s_laddr.sin_addr.s_addr = ifa->ifa_address; | |
1980 | ||
1981 | rv = siw_listen_address(id, backlog, | |
1982 | (struct sockaddr *)&s_laddr, | |
1983 | AF_INET); | |
1984 | if (!rv) | |
1985 | listeners++; | |
1986 | } | |
1987 | } | |
c421651f | 1988 | rtnl_unlock(); |
6c52fdc2 BM | 1989 | in_dev_put(in_dev); |
1990 | } else if (id->local_addr.ss_family == AF_INET6) { | |
1991 | struct inet6_dev *in6_dev = in6_dev_get(dev); | |
1992 | struct inet6_ifaddr *ifp; | |
1993 | struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr), | |
1994 | *s_raddr = &to_sockaddr_in6(id->remote_addr); | |
1995 | |
 | /* in6_dev_get() may return NULL if IPv6 is not configured */ |
 | if (!in6_dev) |
 | return -ENODEV; |
 | |
1996 | siw_dbg(id->device, | |
1997 | "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n", | |
1998 | id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), | |
1999 | &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port)); | |
2000 | ||
2001 | read_lock_bh(&in6_dev->lock); | |
2002 | list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { | |
2003 | struct sockaddr_in6 bind_addr; | |
2004 | ||
2005 | if (ipv6_addr_any(&s_laddr->sin6_addr) || | |
2006 | ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) { | |
2007 | bind_addr.sin6_family = AF_INET6; | |
2008 | bind_addr.sin6_port = s_laddr->sin6_port; | |
2009 | bind_addr.sin6_flowinfo = 0; | |
2010 | bind_addr.sin6_addr = ifp->addr; | |
2011 | bind_addr.sin6_scope_id = dev->ifindex; | |
2012 | ||
2013 | rv = siw_listen_address(id, backlog, | |
2014 | (struct sockaddr *)&bind_addr, | |
2015 | AF_INET6); | |
2016 | if (!rv) | |
2017 | listeners++; | |
2018 | } | |
2019 | } | |
2020 | read_unlock_bh(&in6_dev->lock); | |
2021 | ||
2022 | in6_dev_put(in6_dev); | |
2023 | } else { | |
2024 | return -EAFNOSUPPORT; | |
2025 | } | |
2026 | if (listeners) | |
2027 | rv = 0; | |
2028 | else if (!rv) | |
2029 | rv = -EINVAL; | |
2030 | ||
2031 | siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK"); | |
2032 | ||
2033 | return rv; | |
2034 | } | |
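/*
 * Editorial example (addresses are illustrative): for a netdev
 * configured with 10.0.0.1 and 192.168.0.1, a wildcard (0.0.0.0)
 * listen results in two listening CEPs, one bound per address and
 * both linked on id->provider_data; a listen at 10.0.0.1 creates
 * exactly one.
 */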
2035 | ||
2036 | int siw_destroy_listen(struct iw_cm_id *id) | |
2037 | { | |
2038 | siw_dbg(id->device, "id 0x%p\n", id); | |
2039 | ||
2040 | if (!id->provider_data) { | |
2041 | siw_dbg(id->device, "id 0x%p: no cep(s)\n", id); | |
2042 | return 0; | |
2043 | } | |
2044 | siw_drop_listeners(id); | |
2045 | kfree(id->provider_data); | |
2046 | id->provider_data = NULL; | |
2047 | ||
2048 | return 0; | |
2049 | } | |
2050 | ||
2051 | int siw_cm_init(void) | |
2052 | { | |
2053 | /* | |
2054 | * create_singlethread_workqueue() keeps CM work items strictly ordered |
2055 | */ | |
2056 | siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); | |
2057 | if (!siw_cm_wq) | |
2058 | return -ENOMEM; | |
2059 | ||
2060 | return 0; | |
2061 | } | |
2062 | ||
2063 | void siw_cm_exit(void) | |
2064 | { | |
2065 | if (siw_cm_wq) { | |
2066 | flush_workqueue(siw_cm_wq); | |
2067 | destroy_workqueue(siw_cm_wq); | |
2068 | } | |
2069 | } |