Revert "net/smc: Replace ib_query_gid with rdma_get_gid_attr"
net/smc/smc_core.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting Infiniband API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"

#define SMC_LGR_NUM_INCR                256
#define SMC_LGR_FREE_DELAY_SERV         (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT         (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)

static struct smc_lgr_list smc_lgr_list = {     /* established link groups */
        .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
        .list = LIST_HEAD_INIT(smc_lgr_list.list),
        .num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
                         struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
        /* client link group creation always follows the server link group
         * creation. For client use a somewhat higher removal delay time,
         * otherwise there is a risk of out-of-sync link groups.
         */
        mod_delayed_work(system_wq, &lgr->free_work,
                         lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
                                                 SMC_LGR_FREE_DELAY_SERV);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn        connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
        struct rb_node **link, *parent = NULL;
        u32 token = conn->alert_token_local;

        link = &conn->lgr->conns_all.rb_node;
        while (*link) {
                struct smc_connection *cur = rb_entry(*link,
                                        struct smc_connection, alert_node);

                parent = *link;
                if (cur->alert_token_local > token)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }
        /* Put the new node there */
        rb_link_node(&conn->alert_node, parent, link);
        rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
        struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
        static atomic_t nexttoken = ATOMIC_INIT(0);

        /* find a new alert_token_local value not yet used by some connection
         * in this link group
         */
        sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
        while (!conn->alert_token_local) {
                conn->alert_token_local = atomic_inc_return(&nexttoken);
                if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
                        conn->alert_token_local = 0;
        }
        smc_lgr_add_alert_token(conn);
        conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
        struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
        struct smc_link_group *lgr = conn->lgr;

        rb_erase(&conn->alert_node, &lgr->conns_all);
        lgr->conns_num--;
        conn->alert_token_local = 0;
        conn->lgr = NULL;
        sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
        struct smc_link_group *lgr = conn->lgr;
        int reduced = 0;

        write_lock_bh(&lgr->conns_lock);
        if (conn->alert_token_local) {
                reduced = 1;
                __smc_lgr_unregister_conn(conn);
        }
        write_unlock_bh(&lgr->conns_lock);
        if (!reduced || lgr->conns_num)
                return;
        smc_lgr_schedule_free_work(lgr);
}

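/* free worker: frees the link group after the free delay has expired,
 * provided no connections have been added in the meantime
 */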
static void smc_lgr_free_work(struct work_struct *work)
{
        struct smc_link_group *lgr = container_of(to_delayed_work(work),
                                                  struct smc_link_group,
                                                  free_work);
        bool conns;

        spin_lock_bh(&smc_lgr_list.lock);
        if (list_empty(&lgr->list))
                goto free;
        read_lock_bh(&lgr->conns_lock);
        conns = RB_EMPTY_ROOT(&lgr->conns_all);
        read_unlock_bh(&lgr->conns_lock);
        if (!conns) { /* number of lgr connections is no longer zero */
                spin_unlock_bh(&smc_lgr_list.lock);
                return;
        }
        list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
        spin_unlock_bh(&smc_lgr_list.lock);
        if (!delayed_work_pending(&lgr->free_work)) {
                if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
                        smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
                smc_lgr_free(lgr);
        }
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc,
                          struct smc_ib_device *smcibdev, u8 ibport,
                          char *peer_systemid, unsigned short vlan_id)
{
        struct smc_link_group *lgr;
        struct smc_link *lnk;
        u8 rndvec[3];
        int rc = 0;
        int i;

        lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
        if (!lgr) {
                rc = -ENOMEM;
                goto out;
        }
        lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
        lgr->sync_err = 0;
        memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
        lgr->vlan_id = vlan_id;
        rwlock_init(&lgr->sndbufs_lock);
        rwlock_init(&lgr->rmbs_lock);
        for (i = 0; i < SMC_RMBE_SIZES; i++) {
                INIT_LIST_HEAD(&lgr->sndbufs[i]);
                INIT_LIST_HEAD(&lgr->rmbs[i]);
        }
        smc_lgr_list.num += SMC_LGR_NUM_INCR;
        memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
        INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
        lgr->conns_all = RB_ROOT;

        lnk = &lgr->lnk[SMC_SINGLE_LINK];
        /* initialize link */
        lnk->state = SMC_LNK_ACTIVATING;
        lnk->link_id = SMC_SINGLE_LINK;
        lnk->smcibdev = smcibdev;
        lnk->ibport = ibport;
        lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
        if (!smcibdev->initialized)
                smc_ib_setup_per_ibdev(smcibdev);
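        /* derive a random 24-bit initial packet sequence number (PSN) */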
        get_random_bytes(rndvec, sizeof(rndvec));
        lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
        rc = smc_llc_link_init(lnk);
        if (rc)
                goto free_lgr;
        rc = smc_wr_alloc_link_mem(lnk);
        if (rc)
                goto clear_llc_lnk;
        rc = smc_ib_create_protection_domain(lnk);
        if (rc)
                goto free_link_mem;
        rc = smc_ib_create_queue_pair(lnk);
        if (rc)
                goto dealloc_pd;
        rc = smc_wr_create_link(lnk);
        if (rc)
                goto destroy_qp;

        smc->conn.lgr = lgr;
        rwlock_init(&lgr->conns_lock);
        spin_lock_bh(&smc_lgr_list.lock);
        list_add(&lgr->list, &smc_lgr_list.list);
        spin_unlock_bh(&smc_lgr_list.lock);
        return 0;

destroy_qp:
        smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
        smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
        smc_wr_free_link_mem(lnk);
clear_llc_lnk:
        smc_llc_link_clear(lnk);
free_lgr:
        kfree(lgr);
out:
        return rc;
}

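/* release a terminated connection's buffers: sndbuf and RMB stay in the
 * link group for reuse, unless registering the RMB with the IB device
 * failed, in which case the RMB is removed and freed
 */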
static void smc_buf_unuse(struct smc_connection *conn)
{
        if (conn->sndbuf_desc)
                conn->sndbuf_desc->used = 0;
        if (conn->rmb_desc) {
                if (!conn->rmb_desc->regerr) {
                        conn->rmb_desc->reused = 1;
                        conn->rmb_desc->used = 0;
                } else {
                        /* buf registration failed, reuse not possible */
                        struct smc_link_group *lgr = conn->lgr;

                        write_lock_bh(&lgr->rmbs_lock);
                        list_del(&conn->rmb_desc->list);
                        write_unlock_bh(&lgr->rmbs_lock);

                        smc_buf_free(lgr, true, conn->rmb_desc);
                }
        }
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
        if (!conn->lgr)
                return;
        smc_cdc_tx_dismiss_slots(conn);
        smc_lgr_unregister_conn(conn);
        smc_buf_unuse(conn);
}

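/* tear down a link: clear LLC state, reset and destroy the queue pair,
 * and free the work request resources and the protection domain
 */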
static void smc_link_clear(struct smc_link *lnk)
{
        lnk->peer_qpn = 0;
        smc_llc_link_clear(lnk);
        smc_ib_modify_qp_reset(lnk);
        smc_wr_free_link(lnk);
        smc_ib_destroy_queue_pair(lnk);
        smc_ib_dealloc_protection_domain(lnk);
        smc_wr_free_link_mem(lnk);
}

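/* unmap and free a single sndbuf or RMB, including its memory region
 * (RMBs only), sg table and pages
 */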
static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
                         struct smc_buf_desc *buf_desc)
{
        struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

        if (is_rmb) {
                if (buf_desc->mr_rx[SMC_SINGLE_LINK])
                        smc_ib_put_memory_region(
                                        buf_desc->mr_rx[SMC_SINGLE_LINK]);
                smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
                                    DMA_FROM_DEVICE);
        } else {
                smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
                                    DMA_TO_DEVICE);
        }
        sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
        if (buf_desc->pages)
                __free_pages(buf_desc->pages, buf_desc->order);
        kfree(buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
        struct smc_buf_desc *buf_desc, *bf_desc;
        struct list_head *buf_list;
        int i;

        for (i = 0; i < SMC_RMBE_SIZES; i++) {
                if (is_rmb)
                        buf_list = &lgr->rmbs[i];
                else
                        buf_list = &lgr->sndbufs[i];
                list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
                                         list) {
                        list_del(&buf_desc->list);
                        smc_buf_free(lgr, is_rmb, buf_desc);
                }
        }
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
        /* free send buffers */
        __smc_lgr_free_bufs(lgr, false);
        /* free rmbs */
        __smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
        smc_lgr_free_bufs(lgr);
        smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
        kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
        spin_lock_bh(&smc_lgr_list.lock);
        /* do not use this link group for new connections */
        if (!list_empty(&lgr->list))
                list_del_init(&lgr->list);
        spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
        struct smc_connection *conn;
        struct smc_sock *smc;
        struct rb_node *node;

        if (lgr->terminating)
                return; /* lgr already terminating */
        lgr->terminating = 1;
        if (!list_empty(&lgr->list)) /* forget lgr */
                list_del_init(&lgr->list);
        smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

        write_lock_bh(&lgr->conns_lock);
        node = rb_first(&lgr->conns_all);
        while (node) {
                conn = rb_entry(node, struct smc_connection, alert_node);
                smc = container_of(conn, struct smc_sock, conn);
                sock_hold(&smc->sk); /* sock_put in close work */
                conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
                __smc_lgr_unregister_conn(conn);
                write_unlock_bh(&lgr->conns_lock);
                if (!schedule_work(&conn->close_work))
                        sock_put(&smc->sk);
                write_lock_bh(&lgr->conns_lock);
                node = rb_first(&lgr->conns_all);
        }
        write_unlock_bh(&lgr->conns_lock);
        wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
        smc_lgr_schedule_free_work(lgr);
}

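/* terminate link group abnormally, taking the lgr list lock */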
void smc_lgr_terminate(struct smc_link_group *lgr)
{
        spin_lock_bh(&smc_lgr_list.lock);
        __smc_lgr_terminate(lgr);
        spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
        struct smc_link_group *lgr, *l;

        spin_lock_bh(&smc_lgr_list.lock);
        list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
                if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
                    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
                        __smc_lgr_terminate(lgr);
        }
        spin_unlock_bh(&smc_lgr_list.lock);
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
        struct dst_entry *dst = sk_dst_get(clcsock->sk);
        struct net_device *ndev;
        int i, nest_lvl, rc = 0;

        *vlan_id = 0;
        if (!dst) {
                rc = -ENOTCONN;
                goto out;
        }
        if (!dst->dev) {
                rc = -ENODEV;
                goto out_rel;
        }

        ndev = dst->dev;
        if (is_vlan_dev(ndev)) {
                *vlan_id = vlan_dev_vlan_id(ndev);
                goto out_rel;
        }

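        /* walk the chain of lower devices looking for a vlan device */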
        rtnl_lock();
        nest_lvl = dev_get_nest_level(ndev);
        for (i = 0; i < nest_lvl; i++) {
                struct list_head *lower = &ndev->adj_list.lower;

                if (list_empty(lower))
                        break;
                lower = lower->next;
                ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
                if (is_vlan_dev(ndev)) {
                        *vlan_id = vlan_dev_vlan_id(ndev);
                        break;
                }
        }
        rtnl_unlock();

out_rel:
        dst_release(dst);
out:
        return rc;
}

/* determine the link gid matching the vlan id of the link group */
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
        struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
        struct ib_gid_attr gattr;
        union ib_gid gid;
        int i;

        if (!lgr->vlan_id) {
                lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
                return 0;
        }

        for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
             i++) {
                if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
                                 &gattr))
                        continue;
                if (gattr.ndev) {
                        if (is_vlan_dev(gattr.ndev) &&
                            vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
                                lnk->gid = gid;
                                dev_put(gattr.ndev);
                                return 0;
                        }
                        dev_put(gattr.ndev);
                }
        }
        return -ENODEV;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc,
                    struct smc_ib_device *smcibdev, u8 ibport,
                    struct smc_clc_msg_local *lcl, int srv_first_contact)
{
        struct smc_connection *conn = &smc->conn;
        int local_contact = SMC_FIRST_CONTACT;
        struct smc_link_group *lgr;
        unsigned short vlan_id;
        enum smc_lgr_role role;
        int rc = 0;

        role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
        rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
        if (rc)
                return rc;

        if ((role == SMC_CLNT) && srv_first_contact)
                /* create new link group as well */
                goto create;

        /* determine if an existing link group can be reused */
        spin_lock_bh(&smc_lgr_list.lock);
        list_for_each_entry(lgr, &smc_lgr_list.list, list) {
                write_lock_bh(&lgr->conns_lock);
                if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
                            SMC_SYSTEMID_LEN) &&
                    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
                            SMC_GID_SIZE) &&
                    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
                            sizeof(lcl->mac)) &&
                    !lgr->sync_err &&
                    (lgr->role == role) &&
                    (lgr->vlan_id == vlan_id) &&
                    ((role == SMC_CLNT) ||
                     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
                        /* link group found */
                        local_contact = SMC_REUSE_CONTACT;
                        conn->lgr = lgr;
                        smc_lgr_register_conn(conn); /* add smc conn to lgr */
                        write_unlock_bh(&lgr->conns_lock);
                        break;
                }
                write_unlock_bh(&lgr->conns_lock);
        }
        spin_unlock_bh(&smc_lgr_list.lock);

        if (role == SMC_CLNT && !srv_first_contact &&
            (local_contact == SMC_FIRST_CONTACT)) {
                /* Server reuses a link group, but Client wants to start
                 * a new one
                 * send out_of_sync decline, reason synchr. error
                 */
                return -ENOLINK;
        }

create:
        if (local_contact == SMC_FIRST_CONTACT) {
                rc = smc_lgr_create(smc, smcibdev, ibport,
                                    lcl->id_for_peer, vlan_id);
                if (rc)
                        goto out;
                smc_lgr_register_conn(conn); /* add smc conn to lgr */
                rc = smc_link_determine_gid(conn->lgr);
        }
        conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
        conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
        conn->urg_state = SMC_URG_READ;
#ifndef KERNEL_HAS_ATOMIC64
        spin_lock_init(&conn->acurs_lock);
#endif

out:
        return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds up to the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
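/* worked example (hypothetical request size): asking for 20000 bytes gives
 * (20000 - 1) >> 14 = 1, ilog2(1) + 1 = 1, i.e. the 32KB bucket, since
 * smc_uncompress_bufsize(1) = 1 << (1 + 14) = 32768
 */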
static u8 smc_compress_bufsize(int size)
{
        u8 compressed;

        if (size <= SMC_BUF_MIN_SIZE)
                return 0;

        size = (size - 1) >> 14;
        compressed = ilog2(size) + 1;
        if (compressed >= SMC_RMBE_SIZES)
                compressed = SMC_RMBE_SIZES - 1;
        return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
        u32 size;

        size = 0x00000001 << (((int)compressed) + 14);
        return (int)size;
}

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
                                             rwlock_t *lock,
                                             struct list_head *buf_list)
{
        struct smc_buf_desc *buf_slot;

        read_lock_bh(lock);
        list_for_each_entry(buf_slot, buf_list, list) {
                if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
                        read_unlock_bh(lock);
                        return buf_slot;
                }
        }
        read_unlock_bh(lock);
        return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
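/* e.g. (hypothetical values): with a 64KB RMB and SOCK_MIN_SNDBUF of 4096,
 * the limit is min(65536 / 10, 4096 / 2) = 2048 bytes
 */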
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
        return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

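/* allocate a new sndbuf or RMB of the given size and map it for DMA;
 * for an RMB, also get an IB memory region for remote access
 */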
static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
                                               bool is_rmb, int bufsize)
{
        struct smc_buf_desc *buf_desc;
        struct smc_link *lnk;
        int rc;

        /* try to alloc a new buffer */
        buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
        if (!buf_desc)
                return ERR_PTR(-ENOMEM);

        buf_desc->order = get_order(bufsize);
        buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
                                      __GFP_NOMEMALLOC | __GFP_COMP |
                                      __GFP_NORETRY | __GFP_ZERO,
                                      buf_desc->order);
        if (!buf_desc->pages) {
                kfree(buf_desc);
                return ERR_PTR(-EAGAIN);
        }
        buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

        /* build the sg table from the pages */
        lnk = &lgr->lnk[SMC_SINGLE_LINK];
        rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
                            GFP_KERNEL);
        if (rc) {
                smc_buf_free(lgr, is_rmb, buf_desc);
                return ERR_PTR(rc);
        }
        sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
                   buf_desc->cpu_addr, bufsize);

        /* map sg table to DMA address */
        rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
                               is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
        /* SMC protocol depends on mapping to one DMA address only */
        if (rc != 1)  {
                smc_buf_free(lgr, is_rmb, buf_desc);
                return ERR_PTR(-EAGAIN);
        }

        /* create a new memory region for the RMB */
        if (is_rmb) {
                rc = smc_ib_get_memory_region(lnk->roce_pd,
                                              IB_ACCESS_REMOTE_WRITE |
                                              IB_ACCESS_LOCAL_WRITE,
                                              buf_desc);
                if (rc) {
                        smc_buf_free(lgr, is_rmb, buf_desc);
                        return ERR_PTR(rc);
                }
        }

        buf_desc->len = bufsize;
        return buf_desc;
}

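/* reuse or allocate a sndbuf or RMB for a connection, starting at half the
 * socket buffer size and falling back to smaller sizes on failure
 */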
static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
{
        struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
        struct smc_connection *conn = &smc->conn;
        struct smc_link_group *lgr = conn->lgr;
        struct list_head *buf_list;
        int bufsize, bufsize_short;
        int sk_buf_size;
        rwlock_t *lock;

        if (is_rmb)
                /* use socket recv buffer size (w/o overhead) as start value */
                sk_buf_size = smc->sk.sk_rcvbuf / 2;
        else
                /* use socket send buffer size (w/o overhead) as start value */
                sk_buf_size = smc->sk.sk_sndbuf / 2;

        for (bufsize_short = smc_compress_bufsize(sk_buf_size);
             bufsize_short >= 0; bufsize_short--) {

                if (is_rmb) {
                        lock = &lgr->rmbs_lock;
                        buf_list = &lgr->rmbs[bufsize_short];
                } else {
                        lock = &lgr->sndbufs_lock;
                        buf_list = &lgr->sndbufs[bufsize_short];
                }
                bufsize = smc_uncompress_bufsize(bufsize_short);
                if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
                        continue;

                /* check for reusable slot in the link group */
                buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
                if (buf_desc) {
                        memset(buf_desc->cpu_addr, 0, bufsize);
                        break; /* found reusable slot */
                }

                buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
                if (PTR_ERR(buf_desc) == -ENOMEM)
                        break;
                if (IS_ERR(buf_desc))
                        continue;

                buf_desc->used = 1;
                write_lock_bh(lock);
                list_add(&buf_desc->list, buf_list);
                write_unlock_bh(lock);
                break; /* found */
        }

        if (IS_ERR(buf_desc))
                return -ENOMEM;

        if (is_rmb) {
                conn->rmb_desc = buf_desc;
                conn->rmbe_size_short = bufsize_short;
                smc->sk.sk_rcvbuf = bufsize * 2;
                atomic_set(&conn->bytes_to_rcv, 0);
                conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
        } else {
                conn->sndbuf_desc = buf_desc;
                smc->sk.sk_sndbuf = bufsize * 2;
                atomic_set(&conn->sndbuf_space, bufsize);
        }
        return 0;
}

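/* the four helpers below keep the DMA mappings of the sndbuf and RMB
 * coherent around CPU access and device access, respectively
 */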
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
        struct smc_link_group *lgr = conn->lgr;

        smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
                               conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
        struct smc_link_group *lgr = conn->lgr;

        smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
                                  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
        struct smc_link_group *lgr = conn->lgr;

        smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
                               conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
        struct smc_link_group *lgr = conn->lgr;

        smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
                                  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc)
{
        int rc;

        /* create send buffer */
        rc = __smc_buf_create(smc, false);
        if (rc)
                return rc;
        /* create rmb */
        rc = __smc_buf_create(smc, true);
        if (rc)
                smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
        return rc;
}

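/* reserve a free slot in the link group's rtoken array */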
static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
        int i;

        for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
                if (!test_and_set_bit(i, lgr->rtokens_used_mask))
                        return i;
        }
        return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
        u64 dma_addr = be64_to_cpu(nw_vaddr);
        u32 rkey = ntohl(nw_rkey);
        int i;

        for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
                if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
                    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
                    test_bit(i, lgr->rtokens_used_mask)) {
                        /* already in list */
                        return i;
                }
        }
        i = smc_rmb_reserve_rtoken_idx(lgr);
        if (i < 0)
                return i;
        lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
        lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
        return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
        u32 rkey = ntohl(nw_rkey);
        int i;

        for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
                if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
                    test_bit(i, lgr->rtokens_used_mask)) {
                        lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
                        lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

                        clear_bit(i, lgr->rtokens_used_mask);
                        return 0;
                }
        }
        return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
                            struct smc_clc_msg_accept_confirm *clc)
{
        conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
                                          clc->rmb_rkey);
        if (conn->rtoken_idx < 0)
                return conn->rtoken_idx;
        return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
        struct smc_link_group *lgr, *lg;
        LIST_HEAD(lgr_freeing_list);

        spin_lock_bh(&smc_lgr_list.lock);
        if (!list_empty(&smc_lgr_list.list))
                list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
        spin_unlock_bh(&smc_lgr_list.lock);
        list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
                list_del_init(&lgr->list);
                smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
                cancel_delayed_work_sync(&lgr->free_work);
                smc_lgr_free(lgr); /* free link group */
        }
}