Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64...
[linux-2.6-block.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <linux/sched/signal.h>
31
32 #include <net/sock.h>
33 #include <net/tcp.h>
34 #include <net/smc.h>
35
36 #include "smc.h"
37 #include "smc_clc.h"
38 #include "smc_llc.h"
39 #include "smc_cdc.h"
40 #include "smc_core.h"
41 #include "smc_ib.h"
42 #include "smc_pnet.h"
43 #include "smc_tx.h"
44 #include "smc_rx.h"
45 #include "smc_close.h"
46
/* serializes CLC handshakes that may create a new link group, so only one
 * first-contact setup is in flight at a time
 */
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

/* global list of all established link groups, guarded by its own spinlock */
struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);
57
58 static void smc_set_keepalive(struct sock *sk, int val)
59 {
60         struct smc_sock *smc = smc_sk(sk);
61
62         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63 }
64
/* hash table (single bucket list) holding all SMC sockets */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
68
69 int smc_hash_sk(struct sock *sk)
70 {
71         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72         struct hlist_head *head;
73
74         head = &h->ht;
75
76         write_lock_bh(&h->lock);
77         sk_add_node(sk, head);
78         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79         write_unlock_bh(&h->lock);
80
81         return 0;
82 }
83 EXPORT_SYMBOL_GPL(smc_hash_sk);
84
85 void smc_unhash_sk(struct sock *sk)
86 {
87         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88
89         write_lock_bh(&h->lock);
90         if (sk_del_node_init(sk))
91                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92         write_unlock_bh(&h->lock);
93 }
94 EXPORT_SYMBOL_GPL(smc_unhash_sk);
95
/* protocol descriptor for PF_SMC sockets */
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
107
/* close an SMC socket: tear down the SMC connection state (or just mark
 * closed for fallback sockets), release the internal CLC/TCP socket and
 * detach sk from the struct socket; returns 0 or the smc_close_active()
 * result
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	sock_hold(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (smc->use_fallback) {
		/* fallback: closing work is done by the TCP socket below */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	} else {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (smc->use_fallback) {
		/* defer the final sock_put via delayed work */
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);

	sock_put(sk);
out:
	return rc;
}
156
157 static void smc_destruct(struct sock *sk)
158 {
159         if (sk->sk_state != SMC_CLOSED)
160                 return;
161         if (!sock_flag(sk, SOCK_DEAD))
162                 return;
163
164         sk_refcnt_debug_dec(sk);
165 }
166
/* allocate and initialize a new SMC sock in state SMC_INIT;
 * returns NULL if sk_alloc() fails
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
	sk->sk_prot->hash(sk);	/* smc_hash_sk() */
	sk_refcnt_debug_inc(sk);

	return sk;
}
190
/* bind the SMC socket by binding the internal CLC/TCP socket after
 * replicating the inet_bind() argument checks
 */
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	/* the actual bind is done on the internal TCP socket */
	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
228
229 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
230                                    unsigned long mask)
231 {
232         /* options we don't get control via setsockopt for */
233         nsk->sk_type = osk->sk_type;
234         nsk->sk_sndbuf = osk->sk_sndbuf;
235         nsk->sk_rcvbuf = osk->sk_rcvbuf;
236         nsk->sk_sndtimeo = osk->sk_sndtimeo;
237         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
238         nsk->sk_mark = osk->sk_mark;
239         nsk->sk_priority = osk->sk_priority;
240         nsk->sk_rcvlowat = osk->sk_rcvlowat;
241         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
242         nsk->sk_err = osk->sk_err;
243
244         nsk->sk_flags &= ~mask;
245         nsk->sk_flags |= osk->sk_flags & mask;
246 }
247
/* SOL_SOCKET flag bits that must be mirrored onto the internal CLC/TCP
 * socket whenever they change on the smc socket
 */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
269
/* SOL_SOCKET flag bits inherited back from the CLC/TCP socket */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
279
280 /* determine subnet and mask of internal TCP socket */
281 int smc_netinfo_by_tcpsk(struct socket *clcsock,
282                          __be32 *subnet, u8 *prefix_len)
283 {
284         struct dst_entry *dst = sk_dst_get(clcsock->sk);
285         struct in_device *in_dev;
286         struct sockaddr_in addr;
287         int rc = -ENOENT;
288         int len;
289
290         if (!dst) {
291                 rc = -ENOTCONN;
292                 goto out;
293         }
294         if (!dst->dev) {
295                 rc = -ENODEV;
296                 goto out_rel;
297         }
298
299         /* get address to which the internal TCP socket is bound */
300         kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
301         /* analyze IPv4 specific data of net_device belonging to TCP socket */
302         rcu_read_lock();
303         in_dev = __in_dev_get_rcu(dst->dev);
304         for_ifa(in_dev) {
305                 if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
306                         continue;
307                 *prefix_len = inet_mask_len(ifa->ifa_mask);
308                 *subnet = ifa->ifa_address & ifa->ifa_mask;
309                 rc = 0;
310                 break;
311         } endfor_ifa(in_dev);
312         rcu_read_unlock();
313
314 out_rel:
315         dst_release(dst);
316 out:
317         return rc;
318 }
319
/* client side of first-link setup: wait for the server's CONFIRM LINK
 * request over the RoCE fabric, move the QP to RTS, register the rmb,
 * and answer with a CONFIRM LINK response.
 * Returns 0 on success or a non-zero SMC_CLC_DECL_* / wait result code.
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: the peer is expected to decline */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	/* register our receive buffer with the link before confirming */
	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}
360
/* store connection parameters received in the peer's accept/confirm
 * CLC message into our connection state
 */
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	/* initially the full peer RMBE is available for writing */
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}
369
/* store link parameters (QP number, GID, MAC, PSN, MTU) received in the
 * peer's accept/confirm CLC message into our link state
 */
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
379
/* setup for RDMA connection of client:
 * runs the CLC handshake (proposal/accept/confirm) over the already
 * connected CLC/TCP socket and brings up the RDMA link/connection.
 * Any recoverable failure declines SMC and falls back to plain TCP.
 * Returns a negative error, or on success local_contact (>= 0).
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {	/* hard send error: no decline possible */
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	/* serialize potential link group creation with other handshakes */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* new link: bring the QP into a usable state */
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	return rc;
}
528
/* connect the SMC socket: establish the internal CLC/TCP connection
 * first, then attempt the SMC-R handshake (smc_connect_rdma() falls
 * back to TCP on recoverable failures)
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)	/* IPv6 support postponed */
		goto out_err;
	/* NOTE(review): keeps the caller-provided sockaddr pointer; assumes
	 * it stays valid while smc_connect_rdma() runs — confirm for the
	 * nonblocking-connect case
	 */
	smc->addr = addr;	/* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
574
/* accept one connection on the internal CLC/TCP listen socket and wrap
 * it in a newly allocated SMC sock.
 * Called and returns with the listen sock locked; the lock is dropped
 * around the blocking kernel_accept().
 * On any failure *new_smc is set to NULL; callers must check *new_smc
 * as well as the return code (the listen-closed path returns the
 * kernel_accept() result with *new_smc == NULL).
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct sock *sk = &lsmc->sk;
	struct socket *new_clcsock;
	struct sock *new_sk;
	int rc;

	release_sock(&lsmc->sk);
	new_sk = smc_sock_alloc(sock_net(sk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsmc->sk.sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(&lsmc->sk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(&lsmc->sk);
	if  (rc < 0) {
		/* accept failed: discard the pre-allocated SMC sock */
		lsmc->sk.sk_err = -rc;
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}
	if (lsmc->sk.sk_state == SMC_CLOSED) {
		/* listen sock was closed while we blocked in accept */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
619
/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk);	/* reference owned by the accept queue */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}
633
634 /* remove a socket from the accept queue of its parental listening socket */
635 static void smc_accept_unlink(struct sock *sk)
636 {
637         struct smc_sock *par = smc_sk(sk)->listen_smc;
638
639         spin_lock(&par->accept_q_lock);
640         list_del_init(&smc_sk(sk)->accept_q);
641         spin_unlock(&par->accept_q_lock);
642         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
643         sock_put(sk);
644 }
645
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* sock was closed while queued; drop it and
			 * keep looking for a usable one
			 */
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk);
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
670
/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	sock_hold(sk);
	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (smc->use_fallback) {
		sk->sk_state = SMC_CLOSED;
	} else {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		/* clear the pointer before releasing, so no other path
		 * can still use the clcsock
		 */
		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		/* defer the final sock_put via delayed work */
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);
	sock_put(sk);
}
705
/* server side of first-link setup: register the rmb, send the CONFIRM
 * LINK request over the RoCE fabric and wait for the client's response.
 * Returns 0 on success or a non-zero SMC_CLC_DECL_* / wait result code.
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	/* register our receive buffer with the link before confirming */
	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: the peer is expected to decline */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
741
742 /* setup for RDMA connection of server */
743 static void smc_listen_work(struct work_struct *work)
744 {
745         struct smc_sock *new_smc = container_of(work, struct smc_sock,
746                                                 smc_listen_work);
747         struct socket *newclcsock = new_smc->clcsock;
748         struct smc_sock *lsmc = new_smc->listen_smc;
749         struct smc_clc_msg_accept_confirm cclc;
750         int local_contact = SMC_REUSE_CONTACT;
751         struct sock *newsmcsk = &new_smc->sk;
752         struct smc_clc_msg_proposal pclc;
753         struct smc_ib_device *smcibdev;
754         struct sockaddr_in peeraddr;
755         struct smc_link *link;
756         int reason_code = 0;
757         int rc = 0, len;
758         __be32 subnet;
759         u8 prefix_len;
760         u8 ibport;
761
762         /* do inband token exchange -
763          *wait for and receive SMC Proposal CLC message
764          */
765         reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
766                                        SMC_CLC_PROPOSAL);
767         if (reason_code < 0)
768                 goto out_err;
769         if (reason_code > 0)
770                 goto decline_rdma;
771
772         /* IPSec connections opt out of SMC-R optimizations */
773         if (using_ipsec(new_smc)) {
774                 reason_code = SMC_CLC_DECL_IPSEC;
775                 goto decline_rdma;
776         }
777
778         /* PNET table look up: search active ib_device and port
779          * within same PNETID that also contains the ethernet device
780          * used for the internal TCP socket
781          */
782         smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
783         if (!smcibdev) {
784                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
785                 goto decline_rdma;
786         }
787
788         /* determine subnet and mask from internal TCP socket */
789         rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
790         if (rc) {
791                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
792                 goto decline_rdma;
793         }
794         if ((pclc.outgoing_subnet != subnet) ||
795             (pclc.prefix_len != prefix_len)) {
796                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
797                 goto decline_rdma;
798         }
799
800         /* get address of the peer connected to the internal TCP socket */
801         kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
802
803         /* allocate connection / link group */
804         mutex_lock(&smc_create_lgr_pending);
805         local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
806                                         smcibdev, ibport, &pclc.lcl, 0);
807         if (local_contact < 0) {
808                 rc = local_contact;
809                 if (rc == -ENOMEM)
810                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
811                 goto decline_rdma;
812         }
813         link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
814
815         /* create send buffer and rmb */
816         rc = smc_buf_create(new_smc);
817         if (rc) {
818                 reason_code = SMC_CLC_DECL_MEM;
819                 goto decline_rdma;
820         }
821
822         smc_close_init(new_smc);
823         smc_rx_init(new_smc);
824
825         if (local_contact != SMC_FIRST_CONTACT) {
826                 struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
827
828                 if (!buf_desc->reused) {
829                         /* register memory region for new rmb */
830                         rc = smc_wr_reg_send(link,
831                                              buf_desc->mr_rx[SMC_SINGLE_LINK]);
832                         if (rc) {
833                                 reason_code = SMC_CLC_DECL_INTERR;
834                                 goto decline_rdma;
835                         }
836                 }
837         }
838         smc_rmb_sync_sg_for_device(&new_smc->conn);
839
840         rc = smc_clc_send_accept(new_smc, local_contact);
841         if (rc)
842                 goto out_err;
843
844         /* receive SMC Confirm CLC message */
845         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
846                                        SMC_CLC_CONFIRM);
847         if (reason_code < 0)
848                 goto out_err;
849         if (reason_code > 0)
850                 goto decline_rdma;
851         smc_conn_save_peer_info(new_smc, &cclc);
852         if (local_contact == SMC_FIRST_CONTACT)
853                 smc_link_save_peer_info(link, &cclc);
854
855         rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
856         if (rc) {
857                 reason_code = SMC_CLC_DECL_INTERR;
858                 goto decline_rdma;
859         }
860
861         if (local_contact == SMC_FIRST_CONTACT) {
862                 rc = smc_ib_ready_link(link);
863                 if (rc) {
864                         reason_code = SMC_CLC_DECL_INTERR;
865                         goto decline_rdma;
866                 }
867                 /* QP confirmation over RoCE fabric */
868                 reason_code = smc_serv_conf_first_link(new_smc);
869                 if (reason_code < 0) {
870                         /* peer is not aware of a problem */
871                         rc = reason_code;
872                         goto out_err;
873                 }
874                 if (reason_code > 0)
875                         goto decline_rdma;
876         }
877
878         smc_tx_init(new_smc);
879
880 out_connected:
881         sk_refcnt_debug_inc(newsmcsk);
882         if (newsmcsk->sk_state == SMC_INIT)
883                 newsmcsk->sk_state = SMC_ACTIVE;
884 enqueue:
885         mutex_unlock(&smc_create_lgr_pending);
886         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
887         if (lsmc->sk.sk_state == SMC_LISTEN) {
888                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
889         } else { /* no longer listening */
890                 smc_close_non_accepted(newsmcsk);
891         }
892         release_sock(&lsmc->sk);
893
894         /* Wake up accept */
895         lsmc->sk.sk_data_ready(&lsmc->sk);
896         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
897         return;
898
899 decline_rdma:
900         /* RDMA setup failed, switch back to TCP */
901         smc_conn_free(&new_smc->conn);
902         new_smc->use_fallback = true;
903         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
904                 rc = smc_clc_send_decline(new_smc, reason_code);
905                 if (rc < sizeof(struct smc_clc_msg_decline))
906                         goto out_err;
907         }
908         goto out_connected;
909
910 out_err:
911         newsmcsk->sk_state = SMC_CLOSED;
912         smc_conn_free(&new_smc->conn);
913         goto enqueue; /* queue new sock with sk_err set */
914 }
915
916 static void smc_tcp_listen_work(struct work_struct *work)
917 {
918         struct smc_sock *lsmc = container_of(work, struct smc_sock,
919                                              tcp_listen_work);
920         struct smc_sock *new_smc;
921         int rc = 0;
922
923         lock_sock(&lsmc->sk);
924         while (lsmc->sk.sk_state == SMC_LISTEN) {
925                 rc = smc_clcsock_accept(lsmc, &new_smc);
926                 if (rc)
927                         goto out;
928                 if (!new_smc)
929                         continue;
930
931                 new_smc->listen_smc = lsmc;
932                 new_smc->use_fallback = false; /* assume rdma capability first*/
933                 sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
934                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
935                 smc_copy_sock_settings_to_smc(new_smc);
936                 schedule_work(&new_smc->smc_listen_work);
937         }
938
939 out:
940         release_sock(&lsmc->sk);
941         lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
942 }
943
944 static int smc_listen(struct socket *sock, int backlog)
945 {
946         struct sock *sk = sock->sk;
947         struct smc_sock *smc;
948         int rc;
949
950         smc = smc_sk(sk);
951         lock_sock(sk);
952
953         rc = -EINVAL;
954         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
955                 goto out;
956
957         rc = 0;
958         if (sk->sk_state == SMC_LISTEN) {
959                 sk->sk_max_ack_backlog = backlog;
960                 goto out;
961         }
962         /* some socket options are handled in core, so we could not apply
963          * them to the clc socket -- copy smc socket options to clc socket
964          */
965         smc_copy_sock_settings_to_clc(smc);
966
967         rc = kernel_listen(smc->clcsock, backlog);
968         if (rc)
969                 goto out;
970         sk->sk_max_ack_backlog = backlog;
971         sk->sk_ack_backlog = 0;
972         sk->sk_state = SMC_LISTEN;
973         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
974         schedule_work(&smc->tcp_listen_work);
975
976 out:
977         release_sock(sk);
978         return rc;
979 }
980
/* accept a connection from this socket's accept queue; blocks (subject to
 * the receive timeout / O_NONBLOCK) until smc_listen_work() has enqueued an
 * established child socket
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	lock_sock(sk);

	/* accept is only valid on a listening socket */
	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			/* non-blocking socket or timeout exhausted */
			rc = -EAGAIN;
			break;
		}
		/* drop the sock lock while sleeping so the listen worker
		 * can enqueue new connections in the meantime
		 */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	/* propagate a pending error from the dequeued child, if any */
	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	return rc;
}
1027
1028 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1029                        int *len, int peer)
1030 {
1031         struct smc_sock *smc;
1032
1033         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1034             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1035                 return -ENOTCONN;
1036
1037         smc = smc_sk(sock->sk);
1038
1039         return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1040 }
1041
1042 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1043 {
1044         struct sock *sk = sock->sk;
1045         struct smc_sock *smc;
1046         int rc = -EPIPE;
1047
1048         smc = smc_sk(sk);
1049         lock_sock(sk);
1050         if ((sk->sk_state != SMC_ACTIVE) &&
1051             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1052             (sk->sk_state != SMC_INIT))
1053                 goto out;
1054         if (smc->use_fallback)
1055                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1056         else
1057                 rc = smc_tx_sendmsg(smc, msg, len);
1058 out:
1059         release_sock(sk);
1060         return rc;
1061 }
1062
1063 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1064                        int flags)
1065 {
1066         struct sock *sk = sock->sk;
1067         struct smc_sock *smc;
1068         int rc = -ENOTCONN;
1069
1070         smc = smc_sk(sk);
1071         lock_sock(sk);
1072         if ((sk->sk_state == SMC_INIT) ||
1073             (sk->sk_state == SMC_LISTEN) ||
1074             (sk->sk_state == SMC_CLOSED))
1075                 goto out;
1076
1077         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1078                 rc = 0;
1079                 goto out;
1080         }
1081
1082         if (smc->use_fallback)
1083                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1084         else
1085                 rc = smc_rx_recvmsg(smc, msg, len, flags);
1086
1087 out:
1088         release_sock(sk);
1089         return rc;
1090 }
1091
1092 static unsigned int smc_accept_poll(struct sock *parent)
1093 {
1094         struct smc_sock *isk;
1095         struct sock *sk;
1096
1097         lock_sock(parent);
1098         list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1099                 sk = (struct sock *)isk;
1100
1101                 if (sk->sk_state == SMC_ACTIVE) {
1102                         release_sock(parent);
1103                         return POLLIN | POLLRDNORM;
1104                 }
1105         }
1106         release_sock(parent);
1107
1108         return 0;
1109 }
1110
/* poll entry point; for not-yet-connected or fallback sockets the CLC
 * (TCP) child socket is authoritative, otherwise the SMC connection
 * state determines the mask
 */
static unsigned int smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sock->sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
			/* propagate a TCP-level connect error, if any */
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= POLLERR;
			} else {
				/* complete the SMC handshake now that the
				 * TCP connect finished; may fall back to TCP
				 */
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= POLLERR;
				else
					/* success cases including fallback */
					mask |= POLLOUT | POLLWRNORM;
			}
		}
		release_sock(sk);
	} else {
		sock_poll_wait(file, sk_sleep(sk), wait);
		if (sk->sk_state == SMC_LISTEN)
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		if (sk->sk_err)
			mask |= POLLERR;
		/* writable when send buffer space exists or send side shut */
		if (atomic_read(&smc->conn.sndbuf_space) ||
		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
			mask |= POLLOUT | POLLWRNORM;
		} else {
			/* request a wakeup once space becomes available */
			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		}
		if (atomic_read(&smc->conn.bytes_to_rcv))
			mask |= POLLIN | POLLRDNORM;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= POLLHUP;
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
		/* peer initiated close: data may still be pending */
		if (sk->sk_state == SMC_APPCLOSEWAIT1)
			mask |= POLLIN;

	}

	return mask;
}
1167
1168 static int smc_shutdown(struct socket *sock, int how)
1169 {
1170         struct sock *sk = sock->sk;
1171         struct smc_sock *smc;
1172         int rc = -EINVAL;
1173         int rc1 = 0;
1174
1175         smc = smc_sk(sk);
1176
1177         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1178                 return rc;
1179
1180         lock_sock(sk);
1181
1182         rc = -ENOTCONN;
1183         if ((sk->sk_state != SMC_LISTEN) &&
1184             (sk->sk_state != SMC_ACTIVE) &&
1185             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1186             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1187             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1188             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1189             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1190                 goto out;
1191         if (smc->use_fallback) {
1192                 rc = kernel_sock_shutdown(smc->clcsock, how);
1193                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1194                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1195                         sk->sk_state = SMC_CLOSED;
1196                 goto out;
1197         }
1198         switch (how) {
1199         case SHUT_RDWR:         /* shutdown in both directions */
1200                 rc = smc_close_active(smc);
1201                 break;
1202         case SHUT_WR:
1203                 rc = smc_close_shutdown_write(smc);
1204                 break;
1205         case SHUT_RD:
1206                 if (sk->sk_state == SMC_LISTEN)
1207                         rc = smc_close_active(smc);
1208                 else
1209                         rc = 0;
1210                         /* nothing more to do because peer is not involved */
1211                 break;
1212         }
1213         rc1 = kernel_sock_shutdown(smc->clcsock, how);
1214         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1215         sk->sk_shutdown |= how + 1;
1216
1217 out:
1218         release_sock(sk);
1219         return rc ? rc : rc1;
1220 }
1221
1222 static int smc_setsockopt(struct socket *sock, int level, int optname,
1223                           char __user *optval, unsigned int optlen)
1224 {
1225         struct sock *sk = sock->sk;
1226         struct smc_sock *smc;
1227
1228         smc = smc_sk(sk);
1229
1230         /* generic setsockopts reaching us here always apply to the
1231          * CLC socket
1232          */
1233         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1234                                              optval, optlen);
1235 }
1236
1237 static int smc_getsockopt(struct socket *sock, int level, int optname,
1238                           char __user *optval, int __user *optlen)
1239 {
1240         struct smc_sock *smc;
1241
1242         smc = smc_sk(sock->sk);
1243         /* socket options apply to the CLC socket */
1244         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1245                                              optval, optlen);
1246 }
1247
1248 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1249                      unsigned long arg)
1250 {
1251         struct smc_sock *smc;
1252
1253         smc = smc_sk(sock->sk);
1254         if (smc->use_fallback)
1255                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1256         else
1257                 return sock_no_ioctl(sock, cmd, arg);
1258 }
1259
1260 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1261                             int offset, size_t size, int flags)
1262 {
1263         struct sock *sk = sock->sk;
1264         struct smc_sock *smc;
1265         int rc = -EPIPE;
1266
1267         smc = smc_sk(sk);
1268         lock_sock(sk);
1269         if (sk->sk_state != SMC_ACTIVE)
1270                 goto out;
1271         if (smc->use_fallback)
1272                 rc = kernel_sendpage(smc->clcsock, page, offset,
1273                                      size, flags);
1274         else
1275                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1276
1277 out:
1278         release_sock(sk);
1279         return rc;
1280 }
1281
1282 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1283                                struct pipe_inode_info *pipe, size_t len,
1284                                     unsigned int flags)
1285 {
1286         struct sock *sk = sock->sk;
1287         struct smc_sock *smc;
1288         int rc = -ENOTCONN;
1289
1290         smc = smc_sk(sk);
1291         lock_sock(sk);
1292         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1293                 goto out;
1294         if (smc->use_fallback) {
1295                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1296                                                     pipe, len, flags);
1297         } else {
1298                 rc = -EOPNOTSUPP;
1299         }
1300 out:
1301         release_sock(sk);
1302         return rc;
1303 }
1304
/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,	/* not supported */
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,		/* not supported */
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1327
1328 static int smc_create(struct net *net, struct socket *sock, int protocol,
1329                       int kern)
1330 {
1331         struct smc_sock *smc;
1332         struct sock *sk;
1333         int rc;
1334
1335         rc = -ESOCKTNOSUPPORT;
1336         if (sock->type != SOCK_STREAM)
1337                 goto out;
1338
1339         rc = -EPROTONOSUPPORT;
1340         if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1341                 goto out;
1342
1343         rc = -ENOBUFS;
1344         sock->ops = &smc_sock_ops;
1345         sk = smc_sock_alloc(net, sock);
1346         if (!sk)
1347                 goto out;
1348
1349         /* create internal TCP socket for CLC handshake and fallback */
1350         smc = smc_sk(sk);
1351         smc->use_fallback = false; /* assume rdma capability first */
1352         rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1353                               IPPROTO_TCP, &smc->clcsock);
1354         if (rc)
1355                 sk_common_release(sk);
1356         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1357         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1358
1359 out:
1360         return rc;
1361 }
1362
/* socket family registration: routes socket(AF_SMC, ...) to smc_create() */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
1368
/* module init: bring up pnet table, LLC and CDC layers, register the SMC
 * protocol and socket family, then attach as an IB client; error paths
 * unwind in reverse order of registration
 */
static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	/* NOTE(review): llc/cdc init failures unwind only the pnet table;
	 * no llc/cdc-specific cleanup is visible here -- confirm none needed
	 */
	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}
1418
/* module exit: free all remaining link groups, then unregister everything
 * in reverse order of smc_init()
 */
static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	/* detach the link groups under the lock, free them outside it */
	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}
1437
/* module entry/exit points and metadata for the AF_SMC family */
module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);