75edb080435851eeee93d1181d7c10d5227d4ac9
[linux-2.6-block.git] / drivers / infiniband / hw / mlx5 / main.c
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/module.h>
36 #include <linux/init.h>
37 #include <linux/errno.h>
38 #include <linux/pci.h>
39 #include <linux/dma-mapping.h>
40 #include <linux/slab.h>
41 #include <linux/bitmap.h>
42 #if defined(CONFIG_X86)
43 #include <asm/pat.h>
44 #endif
45 #include <linux/sched.h>
46 #include <linux/sched/mm.h>
47 #include <linux/sched/task.h>
48 #include <linux/delay.h>
49 #include <rdma/ib_user_verbs.h>
50 #include <rdma/ib_addr.h>
51 #include <rdma/ib_cache.h>
52 #include <linux/mlx5/port.h>
53 #include <linux/mlx5/vport.h>
54 #include <linux/mlx5/fs.h>
55 #include <linux/list.h>
56 #include <rdma/ib_smi.h>
57 #include <rdma/ib_umem.h>
58 #include <linux/in.h>
59 #include <linux/etherdevice.h>
60 #include "mlx5_ib.h"
61 #include "ib_rep.h"
62 #include "cmd.h"
63 #include "srq.h"
64 #include <linux/mlx5/fs_helpers.h>
65 #include <linux/mlx5/accel.h>
66 #include <rdma/uverbs_std_types.h>
67 #include <rdma/mlx5_user_ioctl_verbs.h>
68 #include <rdma/mlx5_user_ioctl_cmds.h>
69
70 #define UVERBS_MODULE_NAME mlx5_ib
71 #include <rdma/uverbs_named_ioctl.h>
72
73 #define DRIVER_NAME "mlx5_ib"
74 #define DRIVER_VERSION "5.0-0"
75
76 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
77 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
78 MODULE_LICENSE("Dual BSD/GPL");
79
80 static char mlx5_version[] =
81         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
82         DRIVER_VERSION "\n";
83
84 struct mlx5_ib_event_work {
85         struct work_struct      work;
86         union {
87                 struct mlx5_ib_dev            *dev;
88                 struct mlx5_ib_multiport_info *mpi;
89         };
90         bool                    is_slave;
91         unsigned int            event;
92         void                    *param;
93 };
94
95 enum {
96         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
97 };
98
99 static struct workqueue_struct *mlx5_ib_event_wq;
100 static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
101 static LIST_HEAD(mlx5_ib_dev_list);
102 /*
103  * This mutex should be held when accessing either of the above lists
104  */
105 static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
106
107 /* We can't use an array for xlt_emergency_page because dma_map_single
108  * doesn't work on kernel modules memory
109  */
110 static unsigned long xlt_emergency_page;
111 static struct mutex xlt_emergency_page_mutex;
112
113 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
114 {
115         struct mlx5_ib_dev *dev;
116
117         mutex_lock(&mlx5_ib_multiport_mutex);
118         dev = mpi->ibdev;
119         mutex_unlock(&mlx5_ib_multiport_mutex);
120         return dev;
121 }
122
123 static enum rdma_link_layer
124 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
125 {
126         switch (port_type_cap) {
127         case MLX5_CAP_PORT_TYPE_IB:
128                 return IB_LINK_LAYER_INFINIBAND;
129         case MLX5_CAP_PORT_TYPE_ETH:
130                 return IB_LINK_LAYER_ETHERNET;
131         default:
132                 return IB_LINK_LAYER_UNSPECIFIED;
133         }
134 }
135
136 static enum rdma_link_layer
137 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
138 {
139         struct mlx5_ib_dev *dev = to_mdev(device);
140         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
141
142         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
143 }
144
145 static int get_port_state(struct ib_device *ibdev,
146                           u8 port_num,
147                           enum ib_port_state *state)
148 {
149         struct ib_port_attr attr;
150         int ret;
151
152         memset(&attr, 0, sizeof(attr));
153         ret = ibdev->ops.query_port(ibdev, port_num, &attr);
154         if (!ret)
155                 *state = attr.state;
156         return ret;
157 }
158
159 static int mlx5_netdev_event(struct notifier_block *this,
160                              unsigned long event, void *ptr)
161 {
162         struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
163         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
164         u8 port_num = roce->native_port_num;
165         struct mlx5_core_dev *mdev;
166         struct mlx5_ib_dev *ibdev;
167
168         ibdev = roce->dev;
169         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
170         if (!mdev)
171                 return NOTIFY_DONE;
172
173         switch (event) {
174         case NETDEV_REGISTER:
175                 write_lock(&roce->netdev_lock);
176                 if (ibdev->rep) {
177                         struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch;
178                         struct net_device *rep_ndev;
179
180                         rep_ndev = mlx5_ib_get_rep_netdev(esw,
181                                                           ibdev->rep->vport);
182                         if (rep_ndev == ndev)
183                                 roce->netdev = ndev;
184                 } else if (ndev->dev.parent == &mdev->pdev->dev) {
185                         roce->netdev = ndev;
186                 }
187                 write_unlock(&roce->netdev_lock);
188                 break;
189
190         case NETDEV_UNREGISTER:
191                 write_lock(&roce->netdev_lock);
192                 if (roce->netdev == ndev)
193                         roce->netdev = NULL;
194                 write_unlock(&roce->netdev_lock);
195                 break;
196
197         case NETDEV_CHANGE:
198         case NETDEV_UP:
199         case NETDEV_DOWN: {
200                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
201                 struct net_device *upper = NULL;
202
203                 if (lag_ndev) {
204                         upper = netdev_master_upper_dev_get(lag_ndev);
205                         dev_put(lag_ndev);
206                 }
207
208                 if ((upper == ndev || (!upper && ndev == roce->netdev))
209                     && ibdev->ib_active) {
210                         struct ib_event ibev = { };
211                         enum ib_port_state port_state;
212
213                         if (get_port_state(&ibdev->ib_dev, port_num,
214                                            &port_state))
215                                 goto done;
216
217                         if (roce->last_port_state == port_state)
218                                 goto done;
219
220                         roce->last_port_state = port_state;
221                         ibev.device = &ibdev->ib_dev;
222                         if (port_state == IB_PORT_DOWN)
223                                 ibev.event = IB_EVENT_PORT_ERR;
224                         else if (port_state == IB_PORT_ACTIVE)
225                                 ibev.event = IB_EVENT_PORT_ACTIVE;
226                         else
227                                 goto done;
228
229                         ibev.element.port_num = port_num;
230                         ib_dispatch_event(&ibev);
231                 }
232                 break;
233         }
234
235         default:
236                 break;
237         }
238 done:
239         mlx5_ib_put_native_port_mdev(ibdev, port_num);
240         return NOTIFY_DONE;
241 }
242
243 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
244                                              u8 port_num)
245 {
246         struct mlx5_ib_dev *ibdev = to_mdev(device);
247         struct net_device *ndev;
248         struct mlx5_core_dev *mdev;
249
250         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
251         if (!mdev)
252                 return NULL;
253
254         ndev = mlx5_lag_get_roce_netdev(mdev);
255         if (ndev)
256                 goto out;
257
258         /* Ensure ndev does not disappear before we invoke dev_hold()
259          */
260         read_lock(&ibdev->roce[port_num - 1].netdev_lock);
261         ndev = ibdev->roce[port_num - 1].netdev;
262         if (ndev)
263                 dev_hold(ndev);
264         read_unlock(&ibdev->roce[port_num - 1].netdev_lock);
265
266 out:
267         mlx5_ib_put_native_port_mdev(ibdev, port_num);
268         return ndev;
269 }
270
271 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
272                                                    u8 ib_port_num,
273                                                    u8 *native_port_num)
274 {
275         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
276                                                           ib_port_num);
277         struct mlx5_core_dev *mdev = NULL;
278         struct mlx5_ib_multiport_info *mpi;
279         struct mlx5_ib_port *port;
280
281         if (!mlx5_core_mp_enabled(ibdev->mdev) ||
282             ll != IB_LINK_LAYER_ETHERNET) {
283                 if (native_port_num)
284                         *native_port_num = ib_port_num;
285                 return ibdev->mdev;
286         }
287
288         if (native_port_num)
289                 *native_port_num = 1;
290
291         port = &ibdev->port[ib_port_num - 1];
292         if (!port)
293                 return NULL;
294
295         spin_lock(&port->mp.mpi_lock);
296         mpi = ibdev->port[ib_port_num - 1].mp.mpi;
297         if (mpi && !mpi->unaffiliate) {
298                 mdev = mpi->mdev;
299                 /* If it's the master no need to refcount, it'll exist
300                  * as long as the ib_dev exists.
301                  */
302                 if (!mpi->is_master)
303                         mpi->mdev_refcnt++;
304         }
305         spin_unlock(&port->mp.mpi_lock);
306
307         return mdev;
308 }
309
310 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
311 {
312         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
313                                                           port_num);
314         struct mlx5_ib_multiport_info *mpi;
315         struct mlx5_ib_port *port;
316
317         if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
318                 return;
319
320         port = &ibdev->port[port_num - 1];
321
322         spin_lock(&port->mp.mpi_lock);
323         mpi = ibdev->port[port_num - 1].mp.mpi;
324         if (mpi->is_master)
325                 goto out;
326
327         mpi->mdev_refcnt--;
328         if (mpi->unaffiliate)
329                 complete(&mpi->unref_comp);
330 out:
331         spin_unlock(&port->mp.mpi_lock);
332 }
333
334 static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
335                                     u8 *active_width)
336 {
337         switch (eth_proto_oper) {
338         case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
339         case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
340         case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
341         case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
342                 *active_width = IB_WIDTH_1X;
343                 *active_speed = IB_SPEED_SDR;
344                 break;
345         case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
346         case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
347         case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
348         case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
349         case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
350         case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
351         case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
352                 *active_width = IB_WIDTH_1X;
353                 *active_speed = IB_SPEED_QDR;
354                 break;
355         case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
356         case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
357         case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
358                 *active_width = IB_WIDTH_1X;
359                 *active_speed = IB_SPEED_EDR;
360                 break;
361         case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
362         case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
363         case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
364         case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
365                 *active_width = IB_WIDTH_4X;
366                 *active_speed = IB_SPEED_QDR;
367                 break;
368         case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
369         case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
370         case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
371                 *active_width = IB_WIDTH_1X;
372                 *active_speed = IB_SPEED_HDR;
373                 break;
374         case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
375                 *active_width = IB_WIDTH_4X;
376                 *active_speed = IB_SPEED_FDR;
377                 break;
378         case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
379         case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
380         case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
381         case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
382                 *active_width = IB_WIDTH_4X;
383                 *active_speed = IB_SPEED_EDR;
384                 break;
385         default:
386                 return -EINVAL;
387         }
388
389         return 0;
390 }
391
392 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
393                                 struct ib_port_attr *props)
394 {
395         struct mlx5_ib_dev *dev = to_mdev(device);
396         struct mlx5_core_dev *mdev;
397         struct net_device *ndev, *upper;
398         enum ib_mtu ndev_ib_mtu;
399         bool put_mdev = true;
400         u16 qkey_viol_cntr;
401         u32 eth_prot_oper;
402         u8 mdev_port_num;
403         int err;
404
405         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
406         if (!mdev) {
407                 /* This means the port isn't affiliated yet. Get the
408                  * info for the master port instead.
409                  */
410                 put_mdev = false;
411                 mdev = dev->mdev;
412                 mdev_port_num = 1;
413                 port_num = 1;
414         }
415
416         /* Possible bad flows are checked before filling out props so in case
417          * of an error it will still be zeroed out.
418          */
419         err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper,
420                                              mdev_port_num);
421         if (err)
422                 goto out;
423
424         props->active_width     = IB_WIDTH_4X;
425         props->active_speed     = IB_SPEED_QDR;
426
427         translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
428                                  &props->active_width);
429
430         props->port_cap_flags |= IB_PORT_CM_SUP;
431         props->ip_gids = true;
432
433         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
434                                                 roce_address_table_size);
435         props->max_mtu          = IB_MTU_4096;
436         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
437         props->pkey_tbl_len     = 1;
438         props->state            = IB_PORT_DOWN;
439         props->phys_state       = 3;
440
441         mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
442         props->qkey_viol_cntr = qkey_viol_cntr;
443
444         /* If this is a stub query for an unaffiliated port stop here */
445         if (!put_mdev)
446                 goto out;
447
448         ndev = mlx5_ib_get_netdev(device, port_num);
449         if (!ndev)
450                 goto out;
451
452         if (mlx5_lag_is_active(dev->mdev)) {
453                 rcu_read_lock();
454                 upper = netdev_master_upper_dev_get_rcu(ndev);
455                 if (upper) {
456                         dev_put(ndev);
457                         ndev = upper;
458                         dev_hold(ndev);
459                 }
460                 rcu_read_unlock();
461         }
462
463         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
464                 props->state      = IB_PORT_ACTIVE;
465                 props->phys_state = 5;
466         }
467
468         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
469
470         dev_put(ndev);
471
472         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
473 out:
474         if (put_mdev)
475                 mlx5_ib_put_native_port_mdev(dev, port_num);
476         return err;
477 }
478
479 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
480                          unsigned int index, const union ib_gid *gid,
481                          const struct ib_gid_attr *attr)
482 {
483         enum ib_gid_type gid_type = IB_GID_TYPE_IB;
484         u8 roce_version = 0;
485         u8 roce_l3_type = 0;
486         bool vlan = false;
487         u8 mac[ETH_ALEN];
488         u16 vlan_id = 0;
489
490         if (gid) {
491                 gid_type = attr->gid_type;
492                 ether_addr_copy(mac, attr->ndev->dev_addr);
493
494                 if (is_vlan_dev(attr->ndev)) {
495                         vlan = true;
496                         vlan_id = vlan_dev_vlan_id(attr->ndev);
497                 }
498         }
499
500         switch (gid_type) {
501         case IB_GID_TYPE_IB:
502                 roce_version = MLX5_ROCE_VERSION_1;
503                 break;
504         case IB_GID_TYPE_ROCE_UDP_ENCAP:
505                 roce_version = MLX5_ROCE_VERSION_2;
506                 if (ipv6_addr_v4mapped((void *)gid))
507                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
508                 else
509                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
510                 break;
511
512         default:
513                 mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
514         }
515
516         return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
517                                       roce_l3_type, gid->raw, mac, vlan,
518                                       vlan_id, port_num);
519 }
520
521 static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
522                            __always_unused void **context)
523 {
524         return set_roce_addr(to_mdev(attr->device), attr->port_num,
525                              attr->index, &attr->gid, attr);
526 }
527
528 static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
529                            __always_unused void **context)
530 {
531         return set_roce_addr(to_mdev(attr->device), attr->port_num,
532                              attr->index, NULL, NULL);
533 }
534
535 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
536                                const struct ib_gid_attr *attr)
537 {
538         if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
539                 return 0;
540
541         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
542 }
543
544 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
545 {
546         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
547                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
548         return 0;
549 }
550
551 enum {
552         MLX5_VPORT_ACCESS_METHOD_MAD,
553         MLX5_VPORT_ACCESS_METHOD_HCA,
554         MLX5_VPORT_ACCESS_METHOD_NIC,
555 };
556
557 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
558 {
559         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
560                 return MLX5_VPORT_ACCESS_METHOD_MAD;
561
562         if (mlx5_ib_port_link_layer(ibdev, 1) ==
563             IB_LINK_LAYER_ETHERNET)
564                 return MLX5_VPORT_ACCESS_METHOD_NIC;
565
566         return MLX5_VPORT_ACCESS_METHOD_HCA;
567 }
568
569 static void get_atomic_caps(struct mlx5_ib_dev *dev,
570                             u8 atomic_size_qp,
571                             struct ib_device_attr *props)
572 {
573         u8 tmp;
574         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
575         u8 atomic_req_8B_endianness_mode =
576                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
577
578         /* Check if HW supports 8 bytes standard atomic operations and capable
579          * of host endianness respond
580          */
581         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
582         if (((atomic_operations & tmp) == tmp) &&
583             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
584             (atomic_req_8B_endianness_mode)) {
585                 props->atomic_cap = IB_ATOMIC_HCA;
586         } else {
587                 props->atomic_cap = IB_ATOMIC_NONE;
588         }
589 }
590
591 static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
592                                struct ib_device_attr *props)
593 {
594         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
595
596         get_atomic_caps(dev, atomic_size_qp, props);
597 }
598
599 static void get_atomic_caps_dc(struct mlx5_ib_dev *dev,
600                                struct ib_device_attr *props)
601 {
602         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
603
604         get_atomic_caps(dev, atomic_size_qp, props);
605 }
606
607 bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev)
608 {
609         struct ib_device_attr props = {};
610
611         get_atomic_caps_dc(dev, &props);
612         return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false;
613 }
614 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
615                                         __be64 *sys_image_guid)
616 {
617         struct mlx5_ib_dev *dev = to_mdev(ibdev);
618         struct mlx5_core_dev *mdev = dev->mdev;
619         u64 tmp;
620         int err;
621
622         switch (mlx5_get_vport_access_method(ibdev)) {
623         case MLX5_VPORT_ACCESS_METHOD_MAD:
624                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
625                                                             sys_image_guid);
626
627         case MLX5_VPORT_ACCESS_METHOD_HCA:
628                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
629                 break;
630
631         case MLX5_VPORT_ACCESS_METHOD_NIC:
632                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
633                 break;
634
635         default:
636                 return -EINVAL;
637         }
638
639         if (!err)
640                 *sys_image_guid = cpu_to_be64(tmp);
641
642         return err;
643
644 }
645
646 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
647                                 u16 *max_pkeys)
648 {
649         struct mlx5_ib_dev *dev = to_mdev(ibdev);
650         struct mlx5_core_dev *mdev = dev->mdev;
651
652         switch (mlx5_get_vport_access_method(ibdev)) {
653         case MLX5_VPORT_ACCESS_METHOD_MAD:
654                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
655
656         case MLX5_VPORT_ACCESS_METHOD_HCA:
657         case MLX5_VPORT_ACCESS_METHOD_NIC:
658                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
659                                                 pkey_table_size));
660                 return 0;
661
662         default:
663                 return -EINVAL;
664         }
665 }
666
667 static int mlx5_query_vendor_id(struct ib_device *ibdev,
668                                 u32 *vendor_id)
669 {
670         struct mlx5_ib_dev *dev = to_mdev(ibdev);
671
672         switch (mlx5_get_vport_access_method(ibdev)) {
673         case MLX5_VPORT_ACCESS_METHOD_MAD:
674                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
675
676         case MLX5_VPORT_ACCESS_METHOD_HCA:
677         case MLX5_VPORT_ACCESS_METHOD_NIC:
678                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
679
680         default:
681                 return -EINVAL;
682         }
683 }
684
685 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
686                                 __be64 *node_guid)
687 {
688         u64 tmp;
689         int err;
690
691         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
692         case MLX5_VPORT_ACCESS_METHOD_MAD:
693                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
694
695         case MLX5_VPORT_ACCESS_METHOD_HCA:
696                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
697                 break;
698
699         case MLX5_VPORT_ACCESS_METHOD_NIC:
700                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
701                 break;
702
703         default:
704                 return -EINVAL;
705         }
706
707         if (!err)
708                 *node_guid = cpu_to_be64(tmp);
709
710         return err;
711 }
712
713 struct mlx5_reg_node_desc {
714         u8      desc[IB_DEVICE_NODE_DESC_MAX];
715 };
716
717 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
718 {
719         struct mlx5_reg_node_desc in;
720
721         if (mlx5_use_mad_ifc(dev))
722                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
723
724         memset(&in, 0, sizeof(in));
725
726         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
727                                     sizeof(struct mlx5_reg_node_desc),
728                                     MLX5_REG_NODE_DESC, 0, 0);
729 }
730
731 static int mlx5_ib_query_device(struct ib_device *ibdev,
732                                 struct ib_device_attr *props,
733                                 struct ib_udata *uhw)
734 {
735         struct mlx5_ib_dev *dev = to_mdev(ibdev);
736         struct mlx5_core_dev *mdev = dev->mdev;
737         int err = -ENOMEM;
738         int max_sq_desc;
739         int max_rq_sg;
740         int max_sq_sg;
741         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
742         bool raw_support = !mlx5_core_mp_enabled(mdev);
743         struct mlx5_ib_query_device_resp resp = {};
744         size_t resp_len;
745         u64 max_tso;
746
747         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
748         if (uhw->outlen && uhw->outlen < resp_len)
749                 return -EINVAL;
750         else
751                 resp.response_length = resp_len;
752
753         if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
754                 return -EINVAL;
755
756         memset(props, 0, sizeof(*props));
757         err = mlx5_query_system_image_guid(ibdev,
758                                            &props->sys_image_guid);
759         if (err)
760                 return err;
761
762         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
763         if (err)
764                 return err;
765
766         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
767         if (err)
768                 return err;
769
770         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
771                 (fw_rev_min(dev->mdev) << 16) |
772                 fw_rev_sub(dev->mdev);
773         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
774                 IB_DEVICE_PORT_ACTIVE_EVENT             |
775                 IB_DEVICE_SYS_IMAGE_GUID                |
776                 IB_DEVICE_RC_RNR_NAK_GEN;
777
778         if (MLX5_CAP_GEN(mdev, pkv))
779                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
780         if (MLX5_CAP_GEN(mdev, qkv))
781                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
782         if (MLX5_CAP_GEN(mdev, apm))
783                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
784         if (MLX5_CAP_GEN(mdev, xrc))
785                 props->device_cap_flags |= IB_DEVICE_XRC;
786         if (MLX5_CAP_GEN(mdev, imaicl)) {
787                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
788                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
789                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
790                 /* We support 'Gappy' memory registration too */
791                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
792         }
793         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
794         if (MLX5_CAP_GEN(mdev, sho)) {
795                 props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
796                 /* At this stage no support for signature handover */
797                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
798                                       IB_PROT_T10DIF_TYPE_2 |
799                                       IB_PROT_T10DIF_TYPE_3;
800                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
801                                        IB_GUARD_T10DIF_CSUM;
802         }
803         if (MLX5_CAP_GEN(mdev, block_lb_mc))
804                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
805
806         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
807                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
808                         /* Legacy bit to support old userspace libraries */
809                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
810                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
811                 }
812
813                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
814                         props->raw_packet_caps |=
815                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
816
817                 if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
818                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
819                         if (max_tso) {
820                                 resp.tso_caps.max_tso = 1 << max_tso;
821                                 resp.tso_caps.supported_qpts |=
822                                         1 << IB_QPT_RAW_PACKET;
823                                 resp.response_length += sizeof(resp.tso_caps);
824                         }
825                 }
826
827                 if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
828                         resp.rss_caps.rx_hash_function =
829                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
830                         resp.rss_caps.rx_hash_fields_mask =
831                                                 MLX5_RX_HASH_SRC_IPV4 |
832                                                 MLX5_RX_HASH_DST_IPV4 |
833                                                 MLX5_RX_HASH_SRC_IPV6 |
834                                                 MLX5_RX_HASH_DST_IPV6 |
835                                                 MLX5_RX_HASH_SRC_PORT_TCP |
836                                                 MLX5_RX_HASH_DST_PORT_TCP |
837                                                 MLX5_RX_HASH_SRC_PORT_UDP |
838                                                 MLX5_RX_HASH_DST_PORT_UDP |
839                                                 MLX5_RX_HASH_INNER;
840                         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
841                             MLX5_ACCEL_IPSEC_CAP_DEVICE)
842                                 resp.rss_caps.rx_hash_fields_mask |=
843                                         MLX5_RX_HASH_IPSEC_SPI;
844                         resp.response_length += sizeof(resp.rss_caps);
845                 }
846         } else {
847                 if (field_avail(typeof(resp), tso_caps, uhw->outlen))
848                         resp.response_length += sizeof(resp.tso_caps);
849                 if (field_avail(typeof(resp), rss_caps, uhw->outlen))
850                         resp.response_length += sizeof(resp.rss_caps);
851         }
852
853         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
854                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
855                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
856         }
857
858         if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
859             MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
860             raw_support)
861                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
862
863         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
864             MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
865                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
866
867         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
868             MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
869             raw_support) {
870                 /* Legacy bit to support old userspace libraries */
871                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
872                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
873         }
874
875         if (MLX5_CAP_DEV_MEM(mdev, memic)) {
876                 props->max_dm_size =
877                         MLX5_CAP_DEV_MEM(mdev, max_memic_size);
878         }
879
880         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
881                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
882
883         if (MLX5_CAP_GEN(mdev, end_pad))
884                 props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
885
886         props->vendor_part_id      = mdev->pdev->device;
887         props->hw_ver              = mdev->pdev->revision;
888
889         props->max_mr_size         = ~0ull;
890         props->page_size_cap       = ~(min_page_size - 1);
891         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
892         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
893         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
894                      sizeof(struct mlx5_wqe_data_seg);
895         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
896         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
897                      sizeof(struct mlx5_wqe_raddr_seg)) /
898                 sizeof(struct mlx5_wqe_data_seg);
899         props->max_send_sge = max_sq_sg;
900         props->max_recv_sge = max_rq_sg;
901         props->max_sge_rd          = MLX5_MAX_SGE_RD;
902         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
903         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
904         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
905         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
906         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
907         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
908         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
909         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
910         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
911         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
912         props->max_srq_sge         = max_rq_sg - 1;
913         props->max_fast_reg_page_list_len =
914                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
915         get_atomic_caps_qp(dev, props);
916         props->masked_atomic_cap   = IB_ATOMIC_NONE;
917         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
918         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
919         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
920                                            props->max_mcast_grp;
921         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
922         props->max_ah = INT_MAX;
923         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
924         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
925
926 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
927         if (MLX5_CAP_GEN(mdev, pg))
928                 props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
929         props->odp_caps = dev->odp_caps;
930 #endif
931
932         if (MLX5_CAP_GEN(mdev, cd))
933                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
934
935         if (!mlx5_core_is_pf(mdev))
936                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
937
938         if (mlx5_ib_port_link_layer(ibdev, 1) ==
939             IB_LINK_LAYER_ETHERNET && raw_support) {
940                 props->rss_caps.max_rwq_indirection_tables =
941                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
942                 props->rss_caps.max_rwq_indirection_table_size =
943                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
944                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
945                 props->max_wq_type_rq =
946                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
947         }
948
949         if (MLX5_CAP_GEN(mdev, tag_matching)) {
950                 props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
951                 props->tm_caps.max_num_tags =
952                         (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
953                 props->tm_caps.flags = IB_TM_CAP_RC;
954                 props->tm_caps.max_ops =
955                         1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
956                 props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
957         }
958
959         if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
960                 props->cq_caps.max_cq_moderation_count =
961                                                 MLX5_MAX_CQ_COUNT;
962                 props->cq_caps.max_cq_moderation_period =
963                                                 MLX5_MAX_CQ_PERIOD;
964         }
965
966         if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
967                 resp.response_length += sizeof(resp.cqe_comp_caps);
968
969                 if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
970                         resp.cqe_comp_caps.max_num =
971                                 MLX5_CAP_GEN(dev->mdev,
972                                              cqe_compression_max_num);
973
974                         resp.cqe_comp_caps.supported_format =
975                                 MLX5_IB_CQE_RES_FORMAT_HASH |
976                                 MLX5_IB_CQE_RES_FORMAT_CSUM;
977
978                         if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
979                                 resp.cqe_comp_caps.supported_format |=
980                                         MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
981                 }
982         }
983
984         if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) &&
985             raw_support) {
986                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
987                     MLX5_CAP_GEN(mdev, qos)) {
988                         resp.packet_pacing_caps.qp_rate_limit_max =
989                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
990                         resp.packet_pacing_caps.qp_rate_limit_min =
991                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
992                         resp.packet_pacing_caps.supported_qpts |=
993                                 1 << IB_QPT_RAW_PACKET;
994                         if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
995                             MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
996                                 resp.packet_pacing_caps.cap_flags |=
997                                         MLX5_IB_PP_SUPPORT_BURST;
998                 }
999                 resp.response_length += sizeof(resp.packet_pacing_caps);
1000         }
1001
1002         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
1003                         uhw->outlen)) {
1004                 if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1005                         resp.mlx5_ib_support_multi_pkt_send_wqes =
1006                                 MLX5_IB_ALLOW_MPW;
1007
1008                 if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1009                         resp.mlx5_ib_support_multi_pkt_send_wqes |=
1010                                 MLX5_IB_SUPPORT_EMPW;
1011
1012                 resp.response_length +=
1013                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1014         }
1015
1016         if (field_avail(typeof(resp), flags, uhw->outlen)) {
1017                 resp.response_length += sizeof(resp.flags);
1018
1019                 if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1020                         resp.flags |=
1021                                 MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1022
1023                 if (MLX5_CAP_GEN(mdev, cqe_128_always))
1024                         resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1025                 if (MLX5_CAP_GEN(mdev, qp_packet_based))
1026                         resp.flags |=
1027                                 MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1028         }
1029
1030         if (field_avail(typeof(resp), sw_parsing_caps,
1031                         uhw->outlen)) {
1032                 resp.response_length += sizeof(resp.sw_parsing_caps);
1033                 if (MLX5_CAP_ETH(mdev, swp)) {
1034                         resp.sw_parsing_caps.sw_parsing_offloads |=
1035                                 MLX5_IB_SW_PARSING;
1036
1037                         if (MLX5_CAP_ETH(mdev, swp_csum))
1038                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1039                                         MLX5_IB_SW_PARSING_CSUM;
1040
1041                         if (MLX5_CAP_ETH(mdev, swp_lso))
1042                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1043                                         MLX5_IB_SW_PARSING_LSO;
1044
1045                         if (resp.sw_parsing_caps.sw_parsing_offloads)
1046                                 resp.sw_parsing_caps.supported_qpts =
1047                                         BIT(IB_QPT_RAW_PACKET);
1048                 }
1049         }
1050
1051         if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) &&
1052             raw_support) {
1053                 resp.response_length += sizeof(resp.striding_rq_caps);
1054                 if (MLX5_CAP_GEN(mdev, striding_rq)) {
1055                         resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1056                                 MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1057                         resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1058                                 MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1059                         resp.striding_rq_caps.min_single_wqe_log_num_of_strides =
1060                                 MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1061                         resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1062                                 MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1063                         resp.striding_rq_caps.supported_qpts =
1064                                 BIT(IB_QPT_RAW_PACKET);
1065                 }
1066         }
1067
1068         if (field_avail(typeof(resp), tunnel_offloads_caps,
1069                         uhw->outlen)) {
1070                 resp.response_length += sizeof(resp.tunnel_offloads_caps);
1071                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1072                         resp.tunnel_offloads_caps |=
1073                                 MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1074                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1075                         resp.tunnel_offloads_caps |=
1076                                 MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1077                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1078                         resp.tunnel_offloads_caps |=
1079                                 MLX5_IB_TUNNELED_OFFLOADS_GRE;
1080                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1081                     MLX5_FLEX_PROTO_CW_MPLS_GRE)
1082                         resp.tunnel_offloads_caps |=
1083                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1084                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1085                     MLX5_FLEX_PROTO_CW_MPLS_UDP)
1086                         resp.tunnel_offloads_caps |=
1087                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1088         }
1089
1090         if (uhw->outlen) {
1091                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1092
1093                 if (err)
1094                         return err;
1095         }
1096
1097         return 0;
1098 }
1099
1100 enum mlx5_ib_width {
1101         MLX5_IB_WIDTH_1X        = 1 << 0,
1102         MLX5_IB_WIDTH_2X        = 1 << 1,
1103         MLX5_IB_WIDTH_4X        = 1 << 2,
1104         MLX5_IB_WIDTH_8X        = 1 << 3,
1105         MLX5_IB_WIDTH_12X       = 1 << 4
1106 };
1107
1108 static void translate_active_width(struct ib_device *ibdev, u8 active_width,
1109                                   u8 *ib_width)
1110 {
1111         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1112
1113         if (active_width & MLX5_IB_WIDTH_1X)
1114                 *ib_width = IB_WIDTH_1X;
1115         else if (active_width & MLX5_IB_WIDTH_2X)
1116                 *ib_width = IB_WIDTH_2X;
1117         else if (active_width & MLX5_IB_WIDTH_4X)
1118                 *ib_width = IB_WIDTH_4X;
1119         else if (active_width & MLX5_IB_WIDTH_8X)
1120                 *ib_width = IB_WIDTH_8X;
1121         else if (active_width & MLX5_IB_WIDTH_12X)
1122                 *ib_width = IB_WIDTH_12X;
1123         else {
1124                 mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1125                             (int)active_width);
1126                 *ib_width = IB_WIDTH_4X;
1127         }
1128
1129         return;
1130 }
1131
1132 static int mlx5_mtu_to_ib_mtu(int mtu)
1133 {
1134         switch (mtu) {
1135         case 256: return 1;
1136         case 512: return 2;
1137         case 1024: return 3;
1138         case 2048: return 4;
1139         case 4096: return 5;
1140         default:
1141                 pr_warn("invalid mtu\n");
1142                 return -1;
1143         }
1144 }
1145
1146 enum ib_max_vl_num {
1147         __IB_MAX_VL_0           = 1,
1148         __IB_MAX_VL_0_1         = 2,
1149         __IB_MAX_VL_0_3         = 3,
1150         __IB_MAX_VL_0_7         = 4,
1151         __IB_MAX_VL_0_14        = 5,
1152 };
1153
1154 enum mlx5_vl_hw_cap {
1155         MLX5_VL_HW_0    = 1,
1156         MLX5_VL_HW_0_1  = 2,
1157         MLX5_VL_HW_0_2  = 3,
1158         MLX5_VL_HW_0_3  = 4,
1159         MLX5_VL_HW_0_4  = 5,
1160         MLX5_VL_HW_0_5  = 6,
1161         MLX5_VL_HW_0_6  = 7,
1162         MLX5_VL_HW_0_7  = 8,
1163         MLX5_VL_HW_0_14 = 15
1164 };
1165
1166 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1167                                 u8 *max_vl_num)
1168 {
1169         switch (vl_hw_cap) {
1170         case MLX5_VL_HW_0:
1171                 *max_vl_num = __IB_MAX_VL_0;
1172                 break;
1173         case MLX5_VL_HW_0_1:
1174                 *max_vl_num = __IB_MAX_VL_0_1;
1175                 break;
1176         case MLX5_VL_HW_0_3:
1177                 *max_vl_num = __IB_MAX_VL_0_3;
1178                 break;
1179         case MLX5_VL_HW_0_7:
1180                 *max_vl_num = __IB_MAX_VL_0_7;
1181                 break;
1182         case MLX5_VL_HW_0_14:
1183                 *max_vl_num = __IB_MAX_VL_0_14;
1184                 break;
1185
1186         default:
1187                 return -EINVAL;
1188         }
1189
1190         return 0;
1191 }
1192
1193 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
1194                                struct ib_port_attr *props)
1195 {
1196         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1197         struct mlx5_core_dev *mdev = dev->mdev;
1198         struct mlx5_hca_vport_context *rep;
1199         u16 max_mtu;
1200         u16 oper_mtu;
1201         int err;
1202         u8 ib_link_width_oper;
1203         u8 vl_hw_cap;
1204
1205         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1206         if (!rep) {
1207                 err = -ENOMEM;
1208                 goto out;
1209         }
1210
1211         /* props being zeroed by the caller, avoid zeroing it here */
1212
1213         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1214         if (err)
1215                 goto out;
1216
1217         props->lid              = rep->lid;
1218         props->lmc              = rep->lmc;
1219         props->sm_lid           = rep->sm_lid;
1220         props->sm_sl            = rep->sm_sl;
1221         props->state            = rep->vport_state;
1222         props->phys_state       = rep->port_physical_state;
1223         props->port_cap_flags   = rep->cap_mask1;
1224         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1225         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1226         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1227         props->bad_pkey_cntr    = rep->pkey_violation_counter;
1228         props->qkey_viol_cntr   = rep->qkey_violation_counter;
1229         props->subnet_timeout   = rep->subnet_timeout;
1230         props->init_type_reply  = rep->init_type_reply;
1231
1232         if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1233                 props->port_cap_flags2 = rep->cap_mask2;
1234
1235         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
1236         if (err)
1237                 goto out;
1238
1239         translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1240
1241         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
1242         if (err)
1243                 goto out;
1244
1245         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1246
1247         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1248
1249         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1250
1251         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1252
1253         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1254         if (err)
1255                 goto out;
1256
1257         err = translate_max_vl_num(ibdev, vl_hw_cap,
1258                                    &props->max_vl_num);
1259 out:
1260         kfree(rep);
1261         return err;
1262 }
1263
1264 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
1265                        struct ib_port_attr *props)
1266 {
1267         unsigned int count;
1268         int ret;
1269
1270         switch (mlx5_get_vport_access_method(ibdev)) {
1271         case MLX5_VPORT_ACCESS_METHOD_MAD:
1272                 ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1273                 break;
1274
1275         case MLX5_VPORT_ACCESS_METHOD_HCA:
1276                 ret = mlx5_query_hca_port(ibdev, port, props);
1277                 break;
1278
1279         case MLX5_VPORT_ACCESS_METHOD_NIC:
1280                 ret = mlx5_query_port_roce(ibdev, port, props);
1281                 break;
1282
1283         default:
1284                 ret = -EINVAL;
1285         }
1286
1287         if (!ret && props) {
1288                 struct mlx5_ib_dev *dev = to_mdev(ibdev);
1289                 struct mlx5_core_dev *mdev;
1290                 bool put_mdev = true;
1291
1292                 mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1293                 if (!mdev) {
1294                         /* If the port isn't affiliated yet query the master.
1295                          * The master and slave will have the same values.
1296                          */
1297                         mdev = dev->mdev;
1298                         port = 1;
1299                         put_mdev = false;
1300                 }
1301                 count = mlx5_core_reserved_gids_count(mdev);
1302                 if (put_mdev)
1303                         mlx5_ib_put_native_port_mdev(dev, port);
1304                 props->gid_tbl_len -= count;
1305         }
1306         return ret;
1307 }
1308
1309 static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
1310                                   struct ib_port_attr *props)
1311 {
1312         int ret;
1313
1314         /* Only link layer == ethernet is valid for representors */
1315         ret = mlx5_query_port_roce(ibdev, port, props);
1316         if (ret || !props)
1317                 return ret;
1318
1319         /* We don't support GIDS */
1320         props->gid_tbl_len = 0;
1321
1322         return ret;
1323 }
1324
1325 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1326                              union ib_gid *gid)
1327 {
1328         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1329         struct mlx5_core_dev *mdev = dev->mdev;
1330
1331         switch (mlx5_get_vport_access_method(ibdev)) {
1332         case MLX5_VPORT_ACCESS_METHOD_MAD:
1333                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1334
1335         case MLX5_VPORT_ACCESS_METHOD_HCA:
1336                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1337
1338         default:
1339                 return -EINVAL;
1340         }
1341
1342 }
1343
1344 static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
1345                                    u16 index, u16 *pkey)
1346 {
1347         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1348         struct mlx5_core_dev *mdev;
1349         bool put_mdev = true;
1350         u8 mdev_port_num;
1351         int err;
1352
1353         mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1354         if (!mdev) {
1355                 /* The port isn't affiliated yet, get the PKey from the master
1356                  * port. For RoCE the PKey tables will be the same.
1357                  */
1358                 put_mdev = false;
1359                 mdev = dev->mdev;
1360                 mdev_port_num = 1;
1361         }
1362
1363         err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1364                                         index, pkey);
1365         if (put_mdev)
1366                 mlx5_ib_put_native_port_mdev(dev, port);
1367
1368         return err;
1369 }
1370
1371 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1372                               u16 *pkey)
1373 {
1374         switch (mlx5_get_vport_access_method(ibdev)) {
1375         case MLX5_VPORT_ACCESS_METHOD_MAD:
1376                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1377
1378         case MLX5_VPORT_ACCESS_METHOD_HCA:
1379         case MLX5_VPORT_ACCESS_METHOD_NIC:
1380                 return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1381         default:
1382                 return -EINVAL;
1383         }
1384 }
1385
1386 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1387                                  struct ib_device_modify *props)
1388 {
1389         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1390         struct mlx5_reg_node_desc in;
1391         struct mlx5_reg_node_desc out;
1392         int err;
1393
1394         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1395                 return -EOPNOTSUPP;
1396
1397         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1398                 return 0;
1399
1400         /*
1401          * If possible, pass node desc to FW, so it can generate
1402          * a 144 trap.  If cmd fails, just ignore.
1403          */
1404         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1405         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1406                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1407         if (err)
1408                 return err;
1409
1410         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1411
1412         return err;
1413 }
1414
1415 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1416                                 u32 value)
1417 {
1418         struct mlx5_hca_vport_context ctx = {};
1419         struct mlx5_core_dev *mdev;
1420         u8 mdev_port_num;
1421         int err;
1422
1423         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1424         if (!mdev)
1425                 return -ENODEV;
1426
1427         err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1428         if (err)
1429                 goto out;
1430
1431         if (~ctx.cap_mask1_perm & mask) {
1432                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1433                              mask, ctx.cap_mask1_perm);
1434                 err = -EINVAL;
1435                 goto out;
1436         }
1437
1438         ctx.cap_mask1 = value;
1439         ctx.cap_mask1_perm = mask;
1440         err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1441                                                  0, &ctx);
1442
1443 out:
1444         mlx5_ib_put_native_port_mdev(dev, port_num);
1445
1446         return err;
1447 }
1448
1449 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1450                                struct ib_port_modify *props)
1451 {
1452         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1453         struct ib_port_attr attr;
1454         u32 tmp;
1455         int err;
1456         u32 change_mask;
1457         u32 value;
1458         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1459                       IB_LINK_LAYER_INFINIBAND);
1460
1461         /* CM layer calls ib_modify_port() regardless of the link layer. For
1462          * Ethernet ports, qkey violation and Port capabilities are meaningless.
1463          */
1464         if (!is_ib)
1465                 return 0;
1466
1467         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1468                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1469                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1470                 return set_port_caps_atomic(dev, port, change_mask, value);
1471         }
1472
1473         mutex_lock(&dev->cap_mask_mutex);
1474
1475         err = ib_query_port(ibdev, port, &attr);
1476         if (err)
1477                 goto out;
1478
1479         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1480                 ~props->clr_port_cap_mask;
1481
1482         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1483
1484 out:
1485         mutex_unlock(&dev->cap_mask_mutex);
1486         return err;
1487 }
1488
1489 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1490 {
1491         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1492                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1493 }
1494
1495 static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1496 {
1497         /* Large page with non 4k uar support might limit the dynamic size */
1498         if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1499                 return MLX5_MIN_DYN_BFREGS;
1500
1501         return MLX5_MAX_DYN_BFREGS;
1502 }
1503
1504 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1505                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1506                              struct mlx5_bfreg_info *bfregi)
1507 {
1508         int uars_per_sys_page;
1509         int bfregs_per_sys_page;
1510         int ref_bfregs = req->total_num_bfregs;
1511
1512         if (req->total_num_bfregs == 0)
1513                 return -EINVAL;
1514
1515         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1516         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1517
1518         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1519                 return -ENOMEM;
1520
1521         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1522         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1523         /* This holds the required static allocation asked by the user */
1524         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1525         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1526                 return -EINVAL;
1527
1528         bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1529         bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1530         bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1531         bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1532
1533         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1534                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1535                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1536                     req->total_num_bfregs, bfregi->total_num_bfregs,
1537                     bfregi->num_sys_pages);
1538
1539         return 0;
1540 }
1541
1542 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1543 {
1544         struct mlx5_bfreg_info *bfregi;
1545         int err;
1546         int i;
1547
1548         bfregi = &context->bfregi;
1549         for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1550                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1551                 if (err)
1552                         goto error;
1553
1554                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1555         }
1556
1557         for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1558                 bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1559
1560         return 0;
1561
1562 error:
1563         for (--i; i >= 0; i--)
1564                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1565                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1566
1567         return err;
1568 }
1569
1570 static void deallocate_uars(struct mlx5_ib_dev *dev,
1571                             struct mlx5_ib_ucontext *context)
1572 {
1573         struct mlx5_bfreg_info *bfregi;
1574         int i;
1575
1576         bfregi = &context->bfregi;
1577         for (i = 0; i < bfregi->num_sys_pages; i++)
1578                 if (i < bfregi->num_static_sys_pages ||
1579                     bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1580                         mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1581 }
1582
1583 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1584 {
1585         int err = 0;
1586
1587         mutex_lock(&dev->lb.mutex);
1588         if (td)
1589                 dev->lb.user_td++;
1590         if (qp)
1591                 dev->lb.qps++;
1592
1593         if (dev->lb.user_td == 2 ||
1594             dev->lb.qps == 1) {
1595                 if (!dev->lb.enabled) {
1596                         err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1597                         dev->lb.enabled = true;
1598                 }
1599         }
1600
1601         mutex_unlock(&dev->lb.mutex);
1602
1603         return err;
1604 }
1605
1606 void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1607 {
1608         mutex_lock(&dev->lb.mutex);
1609         if (td)
1610                 dev->lb.user_td--;
1611         if (qp)
1612                 dev->lb.qps--;
1613
1614         if (dev->lb.user_td == 1 &&
1615             dev->lb.qps == 0) {
1616                 if (dev->lb.enabled) {
1617                         mlx5_nic_vport_update_local_lb(dev->mdev, false);
1618                         dev->lb.enabled = false;
1619                 }
1620         }
1621
1622         mutex_unlock(&dev->lb.mutex);
1623 }
1624
1625 static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1626                                           u16 uid)
1627 {
1628         int err;
1629
1630         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1631                 return 0;
1632
1633         err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1634         if (err)
1635                 return err;
1636
1637         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1638             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1639              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1640                 return err;
1641
1642         return mlx5_ib_enable_lb(dev, true, false);
1643 }
1644
1645 static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1646                                              u16 uid)
1647 {
1648         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1649                 return;
1650
1651         mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1652
1653         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1654             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1655              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1656                 return;
1657
1658         mlx5_ib_disable_lb(dev, true, false);
1659 }
1660
1661 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1662                                                   struct ib_udata *udata)
1663 {
1664         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1665         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1666         struct mlx5_ib_alloc_ucontext_resp resp = {};
1667         struct mlx5_core_dev *mdev = dev->mdev;
1668         struct mlx5_ib_ucontext *context;
1669         struct mlx5_bfreg_info *bfregi;
1670         int ver;
1671         int err;
1672         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1673                                      max_cqe_version);
1674         u32 dump_fill_mkey;
1675         bool lib_uar_4k;
1676
1677         if (!dev->ib_active)
1678                 return ERR_PTR(-EAGAIN);
1679
1680         if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1681                 ver = 0;
1682         else if (udata->inlen >= min_req_v2)
1683                 ver = 2;
1684         else
1685                 return ERR_PTR(-EINVAL);
1686
1687         err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1688         if (err)
1689                 return ERR_PTR(err);
1690
1691         if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1692                 return ERR_PTR(-EOPNOTSUPP);
1693
1694         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1695                 return ERR_PTR(-EOPNOTSUPP);
1696
1697         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1698                                     MLX5_NON_FP_BFREGS_PER_UAR);
1699         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1700                 return ERR_PTR(-EINVAL);
1701
1702         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1703         if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1704                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1705         resp.cache_line_size = cache_line_size();
1706         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1707         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1708         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1709         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1710         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1711         resp.cqe_version = min_t(__u8,
1712                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1713                                  req.max_cqe_version);
1714         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1715                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1716         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1717                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1718         resp.response_length = min(offsetof(typeof(resp), response_length) +
1719                                    sizeof(resp.response_length), udata->outlen);
1720
1721         if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
1722                 if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
1723                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
1724                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
1725                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
1726                 if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
1727                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
1728                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
1729                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
1730                 /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
1731         }
1732
1733         context = kzalloc(sizeof(*context), GFP_KERNEL);
1734         if (!context)
1735                 return ERR_PTR(-ENOMEM);
1736
1737         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1738         bfregi = &context->bfregi;
1739
1740         /* updates req->total_num_bfregs */
1741         err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1742         if (err)
1743                 goto out_ctx;
1744
1745         mutex_init(&bfregi->lock);
1746         bfregi->lib_uar_4k = lib_uar_4k;
1747         bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1748                                 GFP_KERNEL);
1749         if (!bfregi->count) {
1750                 err = -ENOMEM;
1751                 goto out_ctx;
1752         }
1753
1754         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1755                                     sizeof(*bfregi->sys_pages),
1756                                     GFP_KERNEL);
1757         if (!bfregi->sys_pages) {
1758                 err = -ENOMEM;
1759                 goto out_count;
1760         }
1761
1762         err = allocate_uars(dev, context);
1763         if (err)
1764                 goto out_sys_pages;
1765
1766 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1767         context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1768 #endif
1769
1770         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1771                 err = mlx5_ib_devx_create(dev, true);
1772                 if (err < 0)
1773                         goto out_uars;
1774                 context->devx_uid = err;
1775         }
1776
1777         err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1778                                              context->devx_uid);
1779         if (err)
1780                 goto out_devx;
1781
1782         if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1783                 err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
1784                 if (err)
1785                         goto out_mdev;
1786         }
1787
1788         INIT_LIST_HEAD(&context->db_page_list);
1789         mutex_init(&context->db_page_mutex);
1790
1791         resp.tot_bfregs = req.total_num_bfregs;
1792         resp.num_ports = dev->num_ports;
1793
1794         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1795                 resp.response_length += sizeof(resp.cqe_version);
1796
1797         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1798                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1799                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1800                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1801         }
1802
1803         if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
1804                 if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1805                         mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
1806                         resp.eth_min_inline++;
1807                 }
1808                 resp.response_length += sizeof(resp.eth_min_inline);
1809         }
1810
1811         if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
1812                 if (mdev->clock_info)
1813                         resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1814                 resp.response_length += sizeof(resp.clock_info_versions);
1815         }
1816
1817         /*
1818          * We don't want to expose information from the PCI bar that is located
1819          * after 4096 bytes, so if the arch only supports larger pages, let's
1820          * pretend we don't support reading the HCA's core clock. This is also
1821          * forced by mmap function.
1822          */
1823         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1824                 if (PAGE_SIZE <= 4096) {
1825                         resp.comp_mask |=
1826                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1827                         resp.hca_core_clock_offset =
1828                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1829                 }
1830                 resp.response_length += sizeof(resp.hca_core_clock_offset);
1831         }
1832
1833         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1834                 resp.response_length += sizeof(resp.log_uar_size);
1835
1836         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1837                 resp.response_length += sizeof(resp.num_uars_per_page);
1838
1839         if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
1840                 resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
1841                 resp.response_length += sizeof(resp.num_dyn_bfregs);
1842         }
1843
1844         if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
1845                 if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1846                         resp.dump_fill_mkey = dump_fill_mkey;
1847                         resp.comp_mask |=
1848                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1849                 }
1850                 resp.response_length += sizeof(resp.dump_fill_mkey);
1851         }
1852
1853         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1854         if (err)
1855                 goto out_mdev;
1856
1857         bfregi->ver = ver;
1858         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1859         context->cqe_version = resp.cqe_version;
1860         context->lib_caps = req.lib_caps;
1861         print_lib_caps(dev, context->lib_caps);
1862
1863         if (mlx5_lag_is_active(dev->mdev)) {
1864                 u8 port = mlx5_core_native_port_num(dev->mdev);
1865
1866                 atomic_set(&context->tx_port_affinity,
1867                            atomic_add_return(
1868                                    1, &dev->roce[port].tx_port_affinity));
1869         }
1870
1871         return &context->ibucontext;
1872
1873 out_mdev:
1874         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1875 out_devx:
1876         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1877                 mlx5_ib_devx_destroy(dev, context->devx_uid);
1878
1879 out_uars:
1880         deallocate_uars(dev, context);
1881
1882 out_sys_pages:
1883         kfree(bfregi->sys_pages);
1884
1885 out_count:
1886         kfree(bfregi->count);
1887
1888 out_ctx:
1889         kfree(context);
1890
1891         return ERR_PTR(err);
1892 }
1893
1894 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1895 {
1896         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1897         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1898         struct mlx5_bfreg_info *bfregi;
1899
1900 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1901         /* All umem's must be destroyed before destroying the ucontext. */
1902         mutex_lock(&ibcontext->per_mm_list_lock);
1903         WARN_ON(!list_empty(&ibcontext->per_mm_list));
1904         mutex_unlock(&ibcontext->per_mm_list_lock);
1905 #endif
1906
1907         bfregi = &context->bfregi;
1908         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1909
1910         if (context->devx_uid)
1911                 mlx5_ib_devx_destroy(dev, context->devx_uid);
1912
1913         deallocate_uars(dev, context);
1914         kfree(bfregi->sys_pages);
1915         kfree(bfregi->count);
1916         kfree(context);
1917
1918         return 0;
1919 }
1920
1921 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1922                                  int uar_idx)
1923 {
1924         int fw_uars_per_page;
1925
1926         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
1927
1928         return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
1929 }
1930
1931 static int get_command(unsigned long offset)
1932 {
1933         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1934 }
1935
1936 static int get_arg(unsigned long offset)
1937 {
1938         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1939 }
1940
1941 static int get_index(unsigned long offset)
1942 {
1943         return get_arg(offset);
1944 }
1945
1946 /* Index resides in an extra byte to enable larger values than 255 */
1947 static int get_extended_index(unsigned long offset)
1948 {
1949         return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
1950 }
1951
1952
1953 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1954 {
1955 }
1956
1957 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1958 {
1959         switch (cmd) {
1960         case MLX5_IB_MMAP_WC_PAGE:
1961                 return "WC";
1962         case MLX5_IB_MMAP_REGULAR_PAGE:
1963                 return "best effort WC";
1964         case MLX5_IB_MMAP_NC_PAGE:
1965                 return "NC";
1966         case MLX5_IB_MMAP_DEVICE_MEM:
1967                 return "Device Memory";
1968         default:
1969                 return NULL;
1970         }
1971 }
1972
1973 static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
1974                                         struct vm_area_struct *vma,
1975                                         struct mlx5_ib_ucontext *context)
1976 {
1977         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1978                 return -EINVAL;
1979
1980         if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
1981                 return -EOPNOTSUPP;
1982
1983         if (vma->vm_flags & VM_WRITE)
1984                 return -EPERM;
1985
1986         if (!dev->mdev->clock_info_page)
1987                 return -EOPNOTSUPP;
1988
1989         return rdma_user_mmap_page(&context->ibucontext, vma,
1990                                    dev->mdev->clock_info_page, PAGE_SIZE);
1991 }
1992
1993 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1994                     struct vm_area_struct *vma,
1995                     struct mlx5_ib_ucontext *context)
1996 {
1997         struct mlx5_bfreg_info *bfregi = &context->bfregi;
1998         int err;
1999         unsigned long idx;
2000         phys_addr_t pfn;
2001         pgprot_t prot;
2002         u32 bfreg_dyn_idx = 0;
2003         u32 uar_index;
2004         int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2005         int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2006                                 bfregi->num_static_sys_pages;
2007
2008         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2009                 return -EINVAL;
2010
2011         if (dyn_uar)
2012                 idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2013         else
2014                 idx = get_index(vma->vm_pgoff);
2015
2016         if (idx >= max_valid_idx) {
2017                 mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2018                              idx, max_valid_idx);
2019                 return -EINVAL;
2020         }
2021
2022         switch (cmd) {
2023         case MLX5_IB_MMAP_WC_PAGE:
2024         case MLX5_IB_MMAP_ALLOC_WC:
2025 /* Some architectures don't support WC memory */
2026 #if defined(CONFIG_X86)
2027                 if (!pat_enabled())
2028                         return -EPERM;
2029 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
2030                         return -EPERM;
2031 #endif
2032         /* fall through */
2033         case MLX5_IB_MMAP_REGULAR_PAGE:
2034                 /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
2035                 prot = pgprot_writecombine(vma->vm_page_prot);
2036                 break;
2037         case MLX5_IB_MMAP_NC_PAGE:
2038                 prot = pgprot_noncached(vma->vm_page_prot);
2039                 break;
2040         default:
2041                 return -EINVAL;
2042         }
2043
2044         if (dyn_uar) {
2045                 int uars_per_page;
2046
2047                 uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2048                 bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2049                 if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2050                         mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2051                                      bfreg_dyn_idx, bfregi->total_num_bfregs);
2052                         return -EINVAL;
2053                 }
2054
2055                 mutex_lock(&bfregi->lock);
2056                 /* Fail if uar already allocated, first bfreg index of each
2057                  * page holds its count.
2058                  */
2059                 if (bfregi->count[bfreg_dyn_idx]) {
2060                         mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2061                         mutex_unlock(&bfregi->lock);
2062                         return -EINVAL;
2063                 }
2064
2065                 bfregi->count[bfreg_dyn_idx]++;
2066                 mutex_unlock(&bfregi->lock);
2067
2068                 err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
2069                 if (err) {
2070                         mlx5_ib_warn(dev, "UAR alloc failed\n");
2071                         goto free_bfreg;
2072                 }
2073         } else {
2074                 uar_index = bfregi->sys_pages[idx];
2075         }
2076
2077         pfn = uar_index2pfn(dev, uar_index);
2078         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2079
2080         err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2081                                 prot);
2082         if (err) {
2083                 mlx5_ib_err(dev,
2084                             "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2085                             err, mmap_cmd2str(cmd));
2086                 goto err;
2087         }
2088
2089         if (dyn_uar)
2090                 bfregi->sys_pages[idx] = uar_index;
2091         return 0;
2092
2093 err:
2094         if (!dyn_uar)
2095                 return err;
2096
2097         mlx5_cmd_free_uar(dev->mdev, idx);
2098
2099 free_bfreg:
2100         mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2101
2102         return err;
2103 }
2104
2105 static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
2106 {
2107         struct mlx5_ib_ucontext *mctx = to_mucontext(context);
2108         struct mlx5_ib_dev *dev = to_mdev(context->device);
2109         u16 page_idx = get_extended_index(vma->vm_pgoff);
2110         size_t map_size = vma->vm_end - vma->vm_start;
2111         u32 npages = map_size >> PAGE_SHIFT;
2112         phys_addr_t pfn;
2113
2114         if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) !=
2115             page_idx + npages)
2116                 return -EINVAL;
2117
2118         pfn = ((pci_resource_start(dev->mdev->pdev, 0) +
2119               MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
2120               PAGE_SHIFT) +
2121               page_idx;
2122         return rdma_user_mmap_io(context, vma, pfn, map_size,
2123                                  pgprot_writecombine(vma->vm_page_prot));
2124 }
2125
2126 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2127 {
2128         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2129         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2130         unsigned long command;
2131         phys_addr_t pfn;
2132
2133         command = get_command(vma->vm_pgoff);
2134         switch (command) {
2135         case MLX5_IB_MMAP_WC_PAGE:
2136         case MLX5_IB_MMAP_NC_PAGE:
2137         case MLX5_IB_MMAP_REGULAR_PAGE:
2138         case MLX5_IB_MMAP_ALLOC_WC:
2139                 return uar_mmap(dev, command, vma, context);
2140
2141         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2142                 return -ENOSYS;
2143
2144         case MLX5_IB_MMAP_CORE_CLOCK:
2145                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2146                         return -EINVAL;
2147
2148                 if (vma->vm_flags & VM_WRITE)
2149                         return -EPERM;
2150
2151                 /* Don't expose to user-space information it shouldn't have */
2152                 if (PAGE_SIZE > 4096)
2153                         return -EOPNOTSUPP;
2154
2155                 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
2156                 pfn = (dev->mdev->iseg_base +
2157                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2158                         PAGE_SHIFT;
2159                 if (io_remap_pfn_range(vma, vma->vm_start, pfn,
2160                                        PAGE_SIZE, vma->vm_page_prot))
2161                         return -EAGAIN;
2162                 break;
2163         case MLX5_IB_MMAP_CLOCK_INFO:
2164                 return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2165
2166         case MLX5_IB_MMAP_DEVICE_MEM:
2167                 return dm_mmap(ibcontext, vma);
2168
2169         default:
2170                 return -EINVAL;
2171         }
2172
2173         return 0;
2174 }
2175
2176 struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
2177                                struct ib_ucontext *context,
2178                                struct ib_dm_alloc_attr *attr,
2179                                struct uverbs_attr_bundle *attrs)
2180 {
2181         u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
2182         struct mlx5_memic *memic = &to_mdev(ibdev)->memic;
2183         phys_addr_t memic_addr;
2184         struct mlx5_ib_dm *dm;
2185         u64 start_offset;
2186         u32 page_idx;
2187         int err;
2188
2189         dm = kzalloc(sizeof(*dm), GFP_KERNEL);
2190         if (!dm)
2191                 return ERR_PTR(-ENOMEM);
2192
2193         mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n",
2194                     attr->length, act_size, attr->alignment);
2195
2196         err = mlx5_cmd_alloc_memic(memic, &memic_addr,
2197                                    act_size, attr->alignment);
2198         if (err)
2199                 goto err_free;
2200
2201         start_offset = memic_addr & ~PAGE_MASK;
2202         page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) -
2203                     MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
2204                     PAGE_SHIFT;
2205
2206         err = uverbs_copy_to(attrs,
2207                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2208                              &start_offset, sizeof(start_offset));
2209         if (err)
2210                 goto err_dealloc;
2211
2212         err = uverbs_copy_to(attrs,
2213                              MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
2214                              &page_idx, sizeof(page_idx));
2215         if (err)
2216                 goto err_dealloc;
2217
2218         bitmap_set(to_mucontext(context)->dm_pages, page_idx,
2219                    DIV_ROUND_UP(act_size, PAGE_SIZE));
2220
2221         dm->dev_addr = memic_addr;
2222
2223         return &dm->ibdm;
2224
2225 err_dealloc:
2226         mlx5_cmd_dealloc_memic(memic, memic_addr,
2227                                act_size);
2228 err_free:
2229         kfree(dm);
2230         return ERR_PTR(err);
2231 }
2232
2233 int mlx5_ib_dealloc_dm(struct ib_dm *ibdm)
2234 {
2235         struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic;
2236         struct mlx5_ib_dm *dm = to_mdm(ibdm);
2237         u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE);
2238         u32 page_idx;
2239         int ret;
2240
2241         ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size);
2242         if (ret)
2243                 return ret;
2244
2245         page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) -
2246                     MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
2247                     PAGE_SHIFT;
2248         bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages,
2249                      page_idx,
2250                      DIV_ROUND_UP(act_size, PAGE_SIZE));
2251
2252         kfree(dm);
2253
2254         return 0;
2255 }
2256
2257 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
2258                                       struct ib_ucontext *context,
2259                                       struct ib_udata *udata)
2260 {
2261         struct mlx5_ib_alloc_pd_resp resp;
2262         struct mlx5_ib_pd *pd;
2263         int err;
2264         u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2265         u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
2266         u16 uid = 0;
2267
2268         pd = kmalloc(sizeof(*pd), GFP_KERNEL);
2269         if (!pd)
2270                 return ERR_PTR(-ENOMEM);
2271
2272         uid = context ? to_mucontext(context)->devx_uid : 0;
2273         MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2274         MLX5_SET(alloc_pd_in, in, uid, uid);
2275         err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
2276                             out, sizeof(out));
2277         if (err) {
2278                 kfree(pd);
2279                 return ERR_PTR(err);
2280         }
2281
2282         pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2283         pd->uid = uid;
2284         if (context) {
2285                 resp.pdn = pd->pdn;
2286                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2287                         mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2288                         kfree(pd);
2289                         return ERR_PTR(-EFAULT);
2290                 }
2291         }
2292
2293         return &pd->ibpd;
2294 }
2295
2296 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
2297 {
2298         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2299         struct mlx5_ib_pd *mpd = to_mpd(pd);
2300
2301         mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2302         kfree(mpd);
2303
2304         return 0;
2305 }
2306
2307 enum {
2308         MATCH_CRITERIA_ENABLE_OUTER_BIT,
2309         MATCH_CRITERIA_ENABLE_MISC_BIT,
2310         MATCH_CRITERIA_ENABLE_INNER_BIT,
2311         MATCH_CRITERIA_ENABLE_MISC2_BIT
2312 };
2313
2314 #define HEADER_IS_ZERO(match_criteria, headers)                            \
2315         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
2316                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
2317
2318 static u8 get_match_criteria_enable(u32 *match_criteria)
2319 {
2320         u8 match_criteria_enable;
2321
2322         match_criteria_enable =
2323                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
2324                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
2325         match_criteria_enable |=
2326                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
2327                 MATCH_CRITERIA_ENABLE_MISC_BIT;
2328         match_criteria_enable |=
2329                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
2330                 MATCH_CRITERIA_ENABLE_INNER_BIT;
2331         match_criteria_enable |=
2332                 (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
2333                 MATCH_CRITERIA_ENABLE_MISC2_BIT;
2334
2335         return match_criteria_enable;
2336 }
2337
2338 static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
2339 {
2340         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
2341         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
2342 }
2343
2344 static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
2345                            bool inner)
2346 {
2347         if (inner) {
2348                 MLX5_SET(fte_match_set_misc,
2349                          misc_c, inner_ipv6_flow_label, mask);
2350                 MLX5_SET(fte_match_set_misc,
2351                          misc_v, inner_ipv6_flow_label, val);
2352         } else {
2353                 MLX5_SET(fte_match_set_misc,
2354                          misc_c, outer_ipv6_flow_label, mask);
2355                 MLX5_SET(fte_match_set_misc,
2356                          misc_v, outer_ipv6_flow_label, val);
2357         }
2358 }
2359
2360 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
2361 {
2362         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
2363         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
2364         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
2365         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
2366 }
2367
2368 static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
2369 {
2370         if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
2371             !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
2372                 return -EOPNOTSUPP;
2373
2374         if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
2375             !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
2376                 return -EOPNOTSUPP;
2377
2378         if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
2379             !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
2380                 return -EOPNOTSUPP;
2381
2382         if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
2383             !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
2384                 return -EOPNOTSUPP;
2385
2386         return 0;
2387 }
2388
2389 #define LAST_ETH_FIELD vlan_tag
2390 #define LAST_IB_FIELD sl
2391 #define LAST_IPV4_FIELD tos
2392 #define LAST_IPV6_FIELD traffic_class
2393 #define LAST_TCP_UDP_FIELD src_port
2394 #define LAST_TUNNEL_FIELD tunnel_id
2395 #define LAST_FLOW_TAG_FIELD tag_id
2396 #define LAST_DROP_FIELD size
2397 #define LAST_COUNTERS_FIELD counters
2398
2399 /* Field is the last supported field */
2400 #define FIELDS_NOT_SUPPORTED(filter, field)\
2401         memchr_inv((void *)&filter.field  +\
2402                    sizeof(filter.field), 0,\
2403                    sizeof(filter) -\
2404                    offsetof(typeof(filter), field) -\
2405                    sizeof(filter.field))
2406
2407 int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
2408                            bool is_egress,
2409                            struct mlx5_flow_act *action)
2410 {
2411
2412         switch (maction->ib_action.type) {
2413         case IB_FLOW_ACTION_ESP:
2414                 if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
2415                                       MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
2416                         return -EINVAL;
2417                 /* Currently only AES_GCM keymat is supported by the driver */
2418                 action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
2419                 action->action |= is_egress ?
2420                         MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
2421                         MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
2422                 return 0;
2423         case IB_FLOW_ACTION_UNSPECIFIED:
2424                 if (maction->flow_action_raw.sub_type ==
2425                     MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
2426                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
2427                                 return -EINVAL;
2428                         action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
2429                         action->modify_id = maction->flow_action_raw.action_id;
2430                         return 0;
2431                 }
2432                 if (maction->flow_action_raw.sub_type ==
2433                     MLX5_IB_FLOW_ACTION_DECAP) {
2434                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
2435                                 return -EINVAL;
2436                         action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
2437                         return 0;
2438                 }
2439                 if (maction->flow_action_raw.sub_type ==
2440                     MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
2441                         if (action->action &
2442                             MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
2443                                 return -EINVAL;
2444                         action->action |=
2445                                 MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
2446                         action->reformat_id =
2447                                 maction->flow_action_raw.action_id;
2448                         return 0;
2449                 }
2450                 /* fall through */
2451         default:
2452                 return -EOPNOTSUPP;
2453         }
2454 }
2455
2456 static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
2457                            u32 *match_v, const union ib_flow_spec *ib_spec,
2458                            const struct ib_flow_attr *flow_attr,
2459                            struct mlx5_flow_act *action, u32 prev_type)
2460 {
2461         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
2462                                            misc_parameters);
2463         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
2464                                            misc_parameters);
2465         void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
2466                                             misc_parameters_2);
2467         void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
2468                                             misc_parameters_2);
2469         void *headers_c;
2470         void *headers_v;
2471         int match_ipv;
2472         int ret;
2473
2474         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2475                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2476                                          inner_headers);
2477                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2478                                          inner_headers);
2479                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2480                                         ft_field_support.inner_ip_version);
2481         } else {
2482                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2483                                          outer_headers);
2484                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2485                                          outer_headers);
2486                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2487                                         ft_field_support.outer_ip_version);
2488         }
2489
2490         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
2491         case IB_FLOW_SPEC_ETH:
2492                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
2493                         return -EOPNOTSUPP;
2494
2495                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2496                                              dmac_47_16),
2497                                 ib_spec->eth.mask.dst_mac);
2498                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2499                                              dmac_47_16),
2500                                 ib_spec->eth.val.dst_mac);
2501
2502                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2503                                              smac_47_16),
2504                                 ib_spec->eth.mask.src_mac);
2505                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2506                                              smac_47_16),
2507                                 ib_spec->eth.val.src_mac);
2508
2509                 if (ib_spec->eth.mask.vlan_tag) {
2510                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2511                                  cvlan_tag, 1);
2512                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2513                                  cvlan_tag, 1);
2514
2515                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2516                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
2517                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2518                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
2519
2520                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2521                                  first_cfi,
2522                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
2523                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2524                                  first_cfi,
2525                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
2526
2527                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2528                                  first_prio,
2529                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
2530                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2531                                  first_prio,
2532                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
2533                 }
2534                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2535                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
2536                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2537                          ethertype, ntohs(ib_spec->eth.val.ether_type));
2538                 break;
2539         case IB_FLOW_SPEC_IPV4:
2540                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
2541                         return -EOPNOTSUPP;
2542
2543                 if (match_ipv) {
2544                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2545                                  ip_version, 0xf);
2546                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2547                                  ip_version, MLX5_FS_IPV4_VERSION);
2548                 } else {
2549                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2550                                  ethertype, 0xffff);
2551                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2552                                  ethertype, ETH_P_IP);
2553                 }
2554
2555                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2556                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2557                        &ib_spec->ipv4.mask.src_ip,
2558                        sizeof(ib_spec->ipv4.mask.src_ip));
2559                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2560                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2561                        &ib_spec->ipv4.val.src_ip,
2562                        sizeof(ib_spec->ipv4.val.src_ip));
2563                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2564                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2565                        &ib_spec->ipv4.mask.dst_ip,
2566                        sizeof(ib_spec->ipv4.mask.dst_ip));
2567                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2568                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2569                        &ib_spec->ipv4.val.dst_ip,
2570                        sizeof(ib_spec->ipv4.val.dst_ip));
2571
2572                 set_tos(headers_c, headers_v,
2573                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
2574
2575                 set_proto(headers_c, headers_v,
2576                           ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
2577                 break;
2578         case IB_FLOW_SPEC_IPV6:
2579                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
2580                         return -EOPNOTSUPP;
2581
2582                 if (match_ipv) {
2583                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2584                                  ip_version, 0xf);
2585                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2586                                  ip_version, MLX5_FS_IPV6_VERSION);
2587                 } else {
2588                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2589                                  ethertype, 0xffff);
2590                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2591                                  ethertype, ETH_P_IPV6);
2592                 }
2593
2594                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2595                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2596                        &ib_spec->ipv6.mask.src_ip,
2597                        sizeof(ib_spec->ipv6.mask.src_ip));
2598                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2599                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2600                        &ib_spec->ipv6.val.src_ip,
2601                        sizeof(ib_spec->ipv6.val.src_ip));
2602                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2603                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2604                        &ib_spec->ipv6.mask.dst_ip,
2605                        sizeof(ib_spec->ipv6.mask.dst_ip));
2606                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2607                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2608                        &ib_spec->ipv6.val.dst_ip,
2609                        sizeof(ib_spec->ipv6.val.dst_ip));
2610
2611                 set_tos(headers_c, headers_v,
2612                         ib_spec->ipv6.mask.traffic_class,
2613                         ib_spec->ipv6.val.traffic_class);
2614
2615                 set_proto(headers_c, headers_v,
2616                           ib_spec->ipv6.mask.next_hdr,
2617                           ib_spec->ipv6.val.next_hdr);
2618
2619                 set_flow_label(misc_params_c, misc_params_v,
2620                                ntohl(ib_spec->ipv6.mask.flow_label),
2621                                ntohl(ib_spec->ipv6.val.flow_label),
2622                                ib_spec->type & IB_FLOW_SPEC_INNER);
2623                 break;
2624         case IB_FLOW_SPEC_ESP:
2625                 if (ib_spec->esp.mask.seq)
2626                         return -EOPNOTSUPP;
2627
2628                 MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
2629                          ntohl(ib_spec->esp.mask.spi));
2630                 MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
2631                          ntohl(ib_spec->esp.val.spi));
2632                 break;
2633         case IB_FLOW_SPEC_TCP:
2634                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2635                                          LAST_TCP_UDP_FIELD))
2636                         return -EOPNOTSUPP;
2637
2638                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2639                          0xff);
2640                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2641                          IPPROTO_TCP);
2642
2643                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
2644                          ntohs(ib_spec->tcp_udp.mask.src_port));
2645                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
2646                          ntohs(ib_spec->tcp_udp.val.src_port));
2647
2648                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
2649                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2650                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
2651                          ntohs(ib_spec->tcp_udp.val.dst_port));
2652                 break;
2653         case IB_FLOW_SPEC_UDP:
2654                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2655                                          LAST_TCP_UDP_FIELD))
2656                         return -EOPNOTSUPP;
2657
2658                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2659                          0xff);
2660                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2661                          IPPROTO_UDP);
2662
2663                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
2664                          ntohs(ib_spec->tcp_udp.mask.src_port));
2665                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
2666                          ntohs(ib_spec->tcp_udp.val.src_port));
2667
2668                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
2669                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2670                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
2671                          ntohs(ib_spec->tcp_udp.val.dst_port));
2672                 break;
2673         case IB_FLOW_SPEC_GRE:
2674                 if (ib_spec->gre.mask.c_ks_res0_ver)
2675                         return -EOPNOTSUPP;
2676
2677                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2678                          0xff);
2679                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2680                          IPPROTO_GRE);
2681
2682                 MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
2683                          ntohs(ib_spec->gre.mask.protocol));
2684                 MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
2685                          ntohs(ib_spec->gre.val.protocol));
2686
2687                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
2688                                     gre_key.nvgre.hi),
2689                        &ib_spec->gre.mask.key,
2690                        sizeof(ib_spec->gre.mask.key));
2691                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
2692                                     gre_key.nvgre.hi),
2693                        &ib_spec->gre.val.key,
2694                        sizeof(ib_spec->gre.val.key));
2695                 break;
2696         case IB_FLOW_SPEC_MPLS:
2697                 switch (prev_type) {
2698                 case IB_FLOW_SPEC_UDP:
2699                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2700                                                    ft_field_support.outer_first_mpls_over_udp),
2701                                                    &ib_spec->mpls.mask.tag))
2702                                 return -EOPNOTSUPP;
2703
2704                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2705                                             outer_first_mpls_over_udp),
2706                                &ib_spec->mpls.val.tag,
2707                                sizeof(ib_spec->mpls.val.tag));
2708                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2709                                             outer_first_mpls_over_udp),
2710                                &ib_spec->mpls.mask.tag,
2711                                sizeof(ib_spec->mpls.mask.tag));
2712                         break;
2713                 case IB_FLOW_SPEC_GRE:
2714                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2715                                                    ft_field_support.outer_first_mpls_over_gre),
2716                                                    &ib_spec->mpls.mask.tag))
2717                                 return -EOPNOTSUPP;
2718
2719                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2720                                             outer_first_mpls_over_gre),
2721                                &ib_spec->mpls.val.tag,
2722                                sizeof(ib_spec->mpls.val.tag));
2723                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2724                                             outer_first_mpls_over_gre),
2725                                &ib_spec->mpls.mask.tag,
2726                                sizeof(ib_spec->mpls.mask.tag));
2727                         break;
2728                 default:
2729                         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2730                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2731                                                            ft_field_support.inner_first_mpls),
2732                                                            &ib_spec->mpls.mask.tag))
2733                                         return -EOPNOTSUPP;
2734
2735                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2736                                                     inner_first_mpls),
2737                                        &ib_spec->mpls.val.tag,
2738                                        sizeof(ib_spec->mpls.val.tag));
2739                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2740                                                     inner_first_mpls),
2741                                        &ib_spec->mpls.mask.tag,
2742                                        sizeof(ib_spec->mpls.mask.tag));
2743                         } else {
2744                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2745                                                            ft_field_support.outer_first_mpls),
2746                                                            &ib_spec->mpls.mask.tag))
2747                                         return -EOPNOTSUPP;
2748
2749                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2750                                                     outer_first_mpls),
2751                                        &ib_spec->mpls.val.tag,
2752                                        sizeof(ib_spec->mpls.val.tag));
2753                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2754                                                     outer_first_mpls),
2755                                        &ib_spec->mpls.mask.tag,
2756                                        sizeof(ib_spec->mpls.mask.tag));
2757                         }
2758                 }
2759                 break;
2760         case IB_FLOW_SPEC_VXLAN_TUNNEL:
2761                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
2762                                          LAST_TUNNEL_FIELD))
2763                         return -EOPNOTSUPP;
2764
2765                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
2766                          ntohl(ib_spec->tunnel.mask.tunnel_id));
2767                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
2768                          ntohl(ib_spec->tunnel.val.tunnel_id));
2769                 break;
2770         case IB_FLOW_SPEC_ACTION_TAG:
2771                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
2772                                          LAST_FLOW_TAG_FIELD))
2773                         return -EOPNOTSUPP;
2774                 if (ib_spec->flow_tag.tag_id >= BIT(24))
2775                         return -EINVAL;
2776
2777                 action->flow_tag = ib_spec->flow_tag.tag_id;
2778                 action->flags |= FLOW_ACT_HAS_TAG;
2779                 break;
2780         case IB_FLOW_SPEC_ACTION_DROP:
2781                 if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
2782                                          LAST_DROP_FIELD))
2783                         return -EOPNOTSUPP;
2784                 action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
2785                 break;
2786         case IB_FLOW_SPEC_ACTION_HANDLE:
2787                 ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
2788                         flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
2789                 if (ret)
2790                         return ret;
2791                 break;
2792         case IB_FLOW_SPEC_ACTION_COUNT:
2793                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
2794                                          LAST_COUNTERS_FIELD))
2795                         return -EOPNOTSUPP;
2796
2797                 /* for now support only one counters spec per flow */
2798                 if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
2799                         return -EINVAL;
2800
2801                 action->counters = ib_spec->flow_count.counters;
2802                 action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
2803                 break;
2804         default:
2805                 return -EINVAL;
2806         }
2807
2808         return 0;
2809 }
2810
2811 /* If a flow could catch both multicast and unicast packets,
2812  * it won't fall into the multicast flow steering table and this rule
2813  * could steal other multicast packets.
2814  */
2815 static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
2816 {
2817         union ib_flow_spec *flow_spec;
2818
2819         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
2820             ib_attr->num_of_specs < 1)
2821                 return false;
2822
2823         flow_spec = (union ib_flow_spec *)(ib_attr + 1);
2824         if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
2825                 struct ib_flow_spec_ipv4 *ipv4_spec;
2826
2827                 ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
2828                 if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
2829                         return true;
2830
2831                 return false;
2832         }
2833
2834         if (flow_spec->type == IB_FLOW_SPEC_ETH) {
2835                 struct ib_flow_spec_eth *eth_spec;
2836
2837                 eth_spec = (struct ib_flow_spec_eth *)flow_spec;
2838                 return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
2839                        is_multicast_ether_addr(eth_spec->val.dst_mac);
2840         }
2841
2842         return false;
2843 }
2844
2845 enum valid_spec {
2846         VALID_SPEC_INVALID,
2847         VALID_SPEC_VALID,
2848         VALID_SPEC_NA,
2849 };
2850
2851 static enum valid_spec
2852 is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
2853                      const struct mlx5_flow_spec *spec,
2854                      const struct mlx5_flow_act *flow_act,
2855                      bool egress)
2856 {
2857         const u32 *match_c = spec->match_criteria;
2858         bool is_crypto =
2859                 (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
2860                                      MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
2861         bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
2862         bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
2863
2864         /*
2865          * Currently only crypto is supported in egress, when regular egress
2866          * rules would be supported, always return VALID_SPEC_NA.
2867          */
2868         if (!is_crypto)
2869                 return VALID_SPEC_NA;
2870
2871         return is_crypto && is_ipsec &&
2872                 (!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
2873                 VALID_SPEC_VALID : VALID_SPEC_INVALID;
2874 }
2875
2876 static bool is_valid_spec(struct mlx5_core_dev *mdev,
2877                           const struct mlx5_flow_spec *spec,
2878                           const struct mlx5_flow_act *flow_act,
2879                           bool egress)
2880 {
2881         /* We curretly only support ipsec egress flow */
2882         return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
2883 }
2884
2885 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
2886                                const struct ib_flow_attr *flow_attr,
2887                                bool check_inner)
2888 {
2889         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
2890         int match_ipv = check_inner ?
2891                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2892                                         ft_field_support.inner_ip_version) :
2893                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2894                                         ft_field_support.outer_ip_version);
2895         int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
2896         bool ipv4_spec_valid, ipv6_spec_valid;
2897         unsigned int ip_spec_type = 0;
2898         bool has_ethertype = false;
2899         unsigned int spec_index;
2900         bool mask_valid = true;
2901         u16 eth_type = 0;
2902         bool type_valid;
2903
2904         /* Validate that ethertype is correct */
2905         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2906                 if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
2907                     ib_spec->eth.mask.ether_type) {
2908                         mask_valid = (ib_spec->eth.mask.ether_type ==
2909                                       htons(0xffff));
2910                         has_ethertype = true;
2911                         eth_type = ntohs(ib_spec->eth.val.ether_type);
2912                 } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
2913                            (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
2914                         ip_spec_type = ib_spec->type;
2915                 }
2916                 ib_spec = (void *)ib_spec + ib_spec->size;
2917         }
2918
2919         type_valid = (!has_ethertype) || (!ip_spec_type);
2920         if (!type_valid && mask_valid) {
2921                 ipv4_spec_valid = (eth_type == ETH_P_IP) &&
2922                         (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
2923                 ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
2924                         (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
2925
2926                 type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
2927                              (((eth_type == ETH_P_MPLS_UC) ||
2928                                (eth_type == ETH_P_MPLS_MC)) && match_ipv);
2929         }
2930
2931         return type_valid;
2932 }
2933
2934 static bool is_valid_attr(struct mlx5_core_dev *mdev,
2935                           const struct ib_flow_attr *flow_attr)
2936 {
2937         return is_valid_ethertype(mdev, flow_attr, false) &&
2938                is_valid_ethertype(mdev, flow_attr, true);
2939 }
2940
2941 static void put_flow_table(struct mlx5_ib_dev *dev,
2942                            struct mlx5_ib_flow_prio *prio, bool ft_added)
2943 {
2944         prio->refcount -= !!ft_added;
2945         if (!prio->refcount) {
2946                 mlx5_destroy_flow_table(prio->flow_table);
2947                 prio->flow_table = NULL;
2948         }
2949 }
2950
2951 static void counters_clear_description(struct ib_counters *counters)
2952 {
2953         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
2954
2955         mutex_lock(&mcounters->mcntrs_mutex);
2956         kfree(mcounters->counters_data);
2957         mcounters->counters_data = NULL;
2958         mcounters->cntrs_max_index = 0;
2959         mutex_unlock(&mcounters->mcntrs_mutex);
2960 }
2961
2962 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
2963 {
2964         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
2965                                                           struct mlx5_ib_flow_handler,
2966                                                           ibflow);
2967         struct mlx5_ib_flow_handler *iter, *tmp;
2968         struct mlx5_ib_dev *dev = handler->dev;
2969
2970         mutex_lock(&dev->flow_db->lock);
2971
2972         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
2973                 mlx5_del_flow_rules(iter->rule);
2974                 put_flow_table(dev, iter->prio, true);
2975                 list_del(&iter->list);
2976                 kfree(iter);
2977         }
2978
2979         mlx5_del_flow_rules(handler->rule);
2980         put_flow_table(dev, handler->prio, true);
2981         if (handler->ibcounters &&
2982             atomic_read(&handler->ibcounters->usecnt) == 1)
2983                 counters_clear_description(handler->ibcounters);
2984
2985         mutex_unlock(&dev->flow_db->lock);
2986         if (handler->flow_matcher)
2987                 atomic_dec(&handler->flow_matcher->usecnt);
2988         kfree(handler);
2989
2990         return 0;
2991 }
2992
2993 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
2994 {
2995         priority *= 2;
2996         if (!dont_trap)
2997                 priority++;
2998         return priority;
2999 }
3000
3001 enum flow_table_type {
3002         MLX5_IB_FT_RX,
3003         MLX5_IB_FT_TX
3004 };
3005
3006 #define MLX5_FS_MAX_TYPES        6
3007 #define MLX5_FS_MAX_ENTRIES      BIT(16)
3008
3009 static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
3010                                            struct mlx5_ib_flow_prio *prio,
3011                                            int priority,
3012                                            int num_entries, int num_groups,
3013                                            u32 flags)
3014 {
3015         struct mlx5_flow_table *ft;
3016
3017         ft = mlx5_create_auto_grouped_flow_table(ns, priority,
3018                                                  num_entries,
3019                                                  num_groups,
3020                                                  0, flags);
3021         if (IS_ERR(ft))
3022                 return ERR_CAST(ft);
3023
3024         prio->flow_table = ft;
3025         prio->refcount = 0;
3026         return prio;
3027 }
3028
3029 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
3030                                                 struct ib_flow_attr *flow_attr,
3031                                                 enum flow_table_type ft_type)
3032 {
3033         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
3034         struct mlx5_flow_namespace *ns = NULL;
3035         struct mlx5_ib_flow_prio *prio;
3036         struct mlx5_flow_table *ft;
3037         int max_table_size;
3038         int num_entries;
3039         int num_groups;
3040         u32 flags = 0;
3041         int priority;
3042
3043         max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3044                                                        log_max_ft_size));
3045         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3046                 enum mlx5_flow_namespace_type fn_type;
3047
3048                 if (flow_is_multicast_only(flow_attr) &&
3049                     !dont_trap)
3050                         priority = MLX5_IB_FLOW_MCAST_PRIO;
3051                 else
3052                         priority = ib_prio_to_core_prio(flow_attr->priority,
3053                                                         dont_trap);
3054                 if (ft_type == MLX5_IB_FT_RX) {
3055                         fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
3056                         prio = &dev->flow_db->prios[priority];
3057                         if (!dev->rep &&
3058                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
3059                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3060                         if (!dev->rep &&
3061                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3062                                         reformat_l3_tunnel_to_l2))
3063                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3064                 } else {
3065                         max_table_size =
3066                                 BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
3067                                                               log_max_ft_size));
3068                         fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
3069                         prio = &dev->flow_db->egress_prios[priority];
3070                         if (!dev->rep &&
3071                             MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
3072                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3073                 }
3074                 ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
3075                 num_entries = MLX5_FS_MAX_ENTRIES;
3076                 num_groups = MLX5_FS_MAX_TYPES;
3077         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3078                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3079                 ns = mlx5_get_flow_namespace(dev->mdev,
3080                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
3081                 build_leftovers_ft_param(&priority,
3082                                          &num_entries,
3083                                          &num_groups);
3084                 prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
3085         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3086                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
3087                                         allow_sniffer_and_nic_rx_shared_tir))
3088                         return ERR_PTR(-ENOTSUPP);
3089
3090                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
3091                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
3092                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
3093
3094                 prio = &dev->flow_db->sniffer[ft_type];
3095                 priority = 0;
3096                 num_entries = 1;
3097                 num_groups = 1;
3098         }
3099
3100         if (!ns)
3101                 return ERR_PTR(-ENOTSUPP);
3102
3103         if (num_entries > max_table_size)
3104                 return ERR_PTR(-ENOMEM);
3105
3106         ft = prio->flow_table;
3107         if (!ft)
3108                 return _get_prio(ns, prio, priority, num_entries, num_groups,
3109                                  flags);
3110
3111         return prio;
3112 }
3113
3114 static void set_underlay_qp(struct mlx5_ib_dev *dev,
3115                             struct mlx5_flow_spec *spec,
3116                             u32 underlay_qpn)
3117 {
3118         void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
3119                                            spec->match_criteria,
3120                                            misc_parameters);
3121         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3122                                            misc_parameters);
3123
3124         if (underlay_qpn &&
3125             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3126                                       ft_field_support.bth_dst_qp)) {
3127                 MLX5_SET(fte_match_set_misc,
3128                          misc_params_v, bth_dst_qp, underlay_qpn);
3129                 MLX5_SET(fte_match_set_misc,
3130                          misc_params_c, bth_dst_qp, 0xffffff);
3131         }
3132 }
3133
3134 static int read_flow_counters(struct ib_device *ibdev,
3135                               struct mlx5_read_counters_attr *read_attr)
3136 {
3137         struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
3138         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3139
3140         return mlx5_fc_query(dev->mdev, fc,
3141                              &read_attr->out[IB_COUNTER_PACKETS],
3142                              &read_attr->out[IB_COUNTER_BYTES]);
3143 }
3144
3145 /* flow counters currently expose two counters packets and bytes */
3146 #define FLOW_COUNTERS_NUM 2
3147 static int counters_set_description(struct ib_counters *counters,
3148                                     enum mlx5_ib_counters_type counters_type,
3149                                     struct mlx5_ib_flow_counters_desc *desc_data,
3150                                     u32 ncounters)
3151 {
3152         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3153         u32 cntrs_max_index = 0;
3154         int i;
3155
3156         if (counters_type != MLX5_IB_COUNTERS_FLOW)
3157                 return -EINVAL;
3158
3159         /* init the fields for the object */
3160         mcounters->type = counters_type;
3161         mcounters->read_counters = read_flow_counters;
3162         mcounters->counters_num = FLOW_COUNTERS_NUM;
3163         mcounters->ncounters = ncounters;
3164         /* each counter entry have both description and index pair */
3165         for (i = 0; i < ncounters; i++) {
3166                 if (desc_data[i].description > IB_COUNTER_BYTES)
3167                         return -EINVAL;
3168
3169                 if (cntrs_max_index <= desc_data[i].index)
3170                         cntrs_max_index = desc_data[i].index + 1;
3171         }
3172
3173         mutex_lock(&mcounters->mcntrs_mutex);
3174         mcounters->counters_data = desc_data;
3175         mcounters->cntrs_max_index = cntrs_max_index;
3176         mutex_unlock(&mcounters->mcntrs_mutex);
3177
3178         return 0;
3179 }
3180
3181 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
3182 static int flow_counters_set_data(struct ib_counters *ibcounters,
3183                                   struct mlx5_ib_create_flow *ucmd)
3184 {
3185         struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
3186         struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
3187         struct mlx5_ib_flow_counters_desc *desc_data = NULL;
3188         bool hw_hndl = false;
3189         int ret = 0;
3190
3191         if (ucmd && ucmd->ncounters_data != 0) {
3192                 cntrs_data = ucmd->data;
3193                 if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
3194                         return -EINVAL;
3195
3196                 desc_data = kcalloc(cntrs_data->ncounters,
3197                                     sizeof(*desc_data),
3198                                     GFP_KERNEL);
3199                 if (!desc_data)
3200                         return  -ENOMEM;
3201
3202                 if (copy_from_user(desc_data,
3203                                    u64_to_user_ptr(cntrs_data->counters_data),
3204                                    sizeof(*desc_data) * cntrs_data->ncounters)) {
3205                         ret = -EFAULT;
3206                         goto free;
3207                 }
3208         }
3209
3210         if (!mcounters->hw_cntrs_hndl) {
3211                 mcounters->hw_cntrs_hndl = mlx5_fc_create(
3212                         to_mdev(ibcounters->device)->mdev, false);
3213                 if (IS_ERR(mcounters->hw_cntrs_hndl)) {
3214                         ret = PTR_ERR(mcounters->hw_cntrs_hndl);
3215                         goto free;
3216                 }
3217                 hw_hndl = true;
3218         }
3219
3220         if (desc_data) {
3221                 /* counters already bound to at least one flow */
3222                 if (mcounters->cntrs_max_index) {
3223                         ret = -EINVAL;
3224                         goto free_hndl;
3225                 }
3226
3227                 ret = counters_set_description(ibcounters,
3228                                                MLX5_IB_COUNTERS_FLOW,
3229                                                desc_data,
3230                                                cntrs_data->ncounters);
3231                 if (ret)
3232                         goto free_hndl;
3233
3234         } else if (!mcounters->cntrs_max_index) {
3235                 /* counters not bound yet, must have udata passed */
3236                 ret = -EINVAL;
3237                 goto free_hndl;
3238         }
3239
3240         return 0;
3241
3242 free_hndl:
3243         if (hw_hndl) {
3244                 mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
3245                                 mcounters->hw_cntrs_hndl);
3246                 mcounters->hw_cntrs_hndl = NULL;
3247         }
3248 free:
3249         kfree(desc_data);
3250         return ret;
3251 }
3252
3253 static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
3254                                                       struct mlx5_ib_flow_prio *ft_prio,
3255                                                       const struct ib_flow_attr *flow_attr,
3256                                                       struct mlx5_flow_destination *dst,
3257                                                       u32 underlay_qpn,
3258                                                       struct mlx5_ib_create_flow *ucmd)
3259 {
3260         struct mlx5_flow_table  *ft = ft_prio->flow_table;
3261         struct mlx5_ib_flow_handler *handler;
3262         struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
3263         struct mlx5_flow_spec *spec;
3264         struct mlx5_flow_destination dest_arr[2] = {};
3265         struct mlx5_flow_destination *rule_dst = dest_arr;
3266         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
3267         unsigned int spec_index;
3268         u32 prev_type = 0;
3269         int err = 0;
3270         int dest_num = 0;
3271         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3272
3273         if (!is_valid_attr(dev->mdev, flow_attr))
3274                 return ERR_PTR(-EINVAL);
3275
3276         if (dev->rep && is_egress)
3277                 return ERR_PTR(-EINVAL);
3278
3279         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
3280         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
3281         if (!handler || !spec) {
3282                 err = -ENOMEM;
3283                 goto free;
3284         }
3285
3286         INIT_LIST_HEAD(&handler->list);
3287         if (dst) {
3288                 memcpy(&dest_arr[0], dst, sizeof(*dst));
3289                 dest_num++;
3290         }
3291
3292         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3293                 err = parse_flow_attr(dev->mdev, spec->match_criteria,
3294                                       spec->match_value,
3295                                       ib_flow, flow_attr, &flow_act,
3296                                       prev_type);
3297                 if (err < 0)
3298                         goto free;
3299
3300                 prev_type = ((union ib_flow_spec *)ib_flow)->type;
3301                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
3302         }
3303
3304         if (!flow_is_multicast_only(flow_attr))
3305                 set_underlay_qp(dev, spec, underlay_qpn);
3306
3307         if (dev->rep) {
3308                 void *misc;
3309
3310                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3311                                     misc_parameters);
3312                 MLX5_SET(fte_match_set_misc, misc, source_port,
3313                          dev->rep->vport);
3314                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3315                                     misc_parameters);
3316                 MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
3317         }
3318
3319         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
3320
3321         if (is_egress &&
3322             !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
3323                 err = -EINVAL;
3324                 goto free;
3325         }
3326
3327         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
3328                 struct mlx5_ib_mcounters *mcounters;
3329
3330                 err = flow_counters_set_data(flow_act.counters, ucmd);
3331                 if (err)
3332                         goto free;
3333
3334                 mcounters = to_mcounters(flow_act.counters);
3335                 handler->ibcounters = flow_act.counters;
3336                 dest_arr[dest_num].type =
3337                         MLX5_FLOW_DESTINATION_TYPE_COUNTER;
3338                 dest_arr[dest_num].counter_id =
3339                         mlx5_fc_id(mcounters->hw_cntrs_hndl);
3340                 dest_num++;
3341         }
3342
3343         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
3344                 if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) {
3345                         rule_dst = NULL;
3346                         dest_num = 0;
3347                 }
3348         } else {
3349                 if (is_egress)
3350                         flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
3351                 else
3352                         flow_act.action |=
3353                                 dest_num ?  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
3354                                         MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
3355         }
3356
3357         if ((flow_act.flags & FLOW_ACT_HAS_TAG)  &&
3358             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3359              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3360                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
3361                              flow_act.flow_tag, flow_attr->type);
3362                 err = -EINVAL;
3363                 goto free;
3364         }
3365         handler->rule = mlx5_add_flow_rules(ft, spec,
3366                                             &flow_act,
3367                                             rule_dst, dest_num);
3368
3369         if (IS_ERR(handler->rule)) {
3370                 err = PTR_ERR(handler->rule);
3371                 goto free;
3372         }
3373
3374         ft_prio->refcount++;
3375         handler->prio = ft_prio;
3376         handler->dev = dev;
3377
3378         ft_prio->flow_table = ft;
3379 free:
3380         if (err && handler) {
3381                 if (handler->ibcounters &&
3382                     atomic_read(&handler->ibcounters->usecnt) == 1)
3383                         counters_clear_description(handler->ibcounters);
3384                 kfree(handler);
3385         }
3386         kvfree(spec);
3387         return err ? ERR_PTR(err) : handler;
3388 }
3389
3390 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
3391                                                      struct mlx5_ib_flow_prio *ft_prio,
3392                                                      const struct ib_flow_attr *flow_attr,
3393                                                      struct mlx5_flow_destination *dst)
3394 {
3395         return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
3396 }
3397
3398 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
3399                                                           struct mlx5_ib_flow_prio *ft_prio,
3400                                                           struct ib_flow_attr *flow_attr,
3401                                                           struct mlx5_flow_destination *dst)
3402 {
3403         struct mlx5_ib_flow_handler *handler_dst = NULL;
3404         struct mlx5_ib_flow_handler *handler = NULL;
3405
3406         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
3407         if (!IS_ERR(handler)) {
3408                 handler_dst = create_flow_rule(dev, ft_prio,
3409                                                flow_attr, dst);
3410                 if (IS_ERR(handler_dst)) {
3411                         mlx5_del_flow_rules(handler->rule);
3412                         ft_prio->refcount--;
3413                         kfree(handler);
3414                         handler = handler_dst;
3415                 } else {
3416                         list_add(&handler_dst->list, &handler->list);
3417                 }
3418         }
3419
3420         return handler;
3421 }
3422 enum {
3423         LEFTOVERS_MC,
3424         LEFTOVERS_UC,
3425 };
3426
3427 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
3428                                                           struct mlx5_ib_flow_prio *ft_prio,
3429                                                           struct ib_flow_attr *flow_attr,
3430                                                           struct mlx5_flow_destination *dst)
3431 {
3432         struct mlx5_ib_flow_handler *handler_ucast = NULL;
3433         struct mlx5_ib_flow_handler *handler = NULL;
3434
3435         static struct {
3436                 struct ib_flow_attr     flow_attr;
3437                 struct ib_flow_spec_eth eth_flow;
3438         } leftovers_specs[] = {
3439                 [LEFTOVERS_MC] = {
3440                         .flow_attr = {
3441                                 .num_of_specs = 1,
3442                                 .size = sizeof(leftovers_specs[0])
3443                         },
3444                         .eth_flow = {
3445                                 .type = IB_FLOW_SPEC_ETH,
3446                                 .size = sizeof(struct ib_flow_spec_eth),
3447                                 .mask = {.dst_mac = {0x1} },
3448                                 .val =  {.dst_mac = {0x1} }
3449                         }
3450                 },
3451                 [LEFTOVERS_UC] = {
3452                         .flow_attr = {
3453                                 .num_of_specs = 1,
3454                                 .size = sizeof(leftovers_specs[0])
3455                         },
3456                         .eth_flow = {
3457                                 .type = IB_FLOW_SPEC_ETH,
3458                                 .size = sizeof(struct ib_flow_spec_eth),
3459                                 .mask = {.dst_mac = {0x1} },
3460                                 .val = {.dst_mac = {} }
3461                         }
3462                 }
3463         };
3464
3465         handler = create_flow_rule(dev, ft_prio,
3466                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
3467                                    dst);
3468         if (!IS_ERR(handler) &&
3469             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
3470                 handler_ucast = create_flow_rule(dev, ft_prio,
3471                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
3472                                                  dst);
3473                 if (IS_ERR(handler_ucast)) {
3474                         mlx5_del_flow_rules(handler->rule);
3475                         ft_prio->refcount--;
3476                         kfree(handler);
3477                         handler = handler_ucast;
3478                 } else {
3479                         list_add(&handler_ucast->list, &handler->list);
3480                 }
3481         }
3482
3483         return handler;
3484 }
3485
3486 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
3487                                                         struct mlx5_ib_flow_prio *ft_rx,
3488                                                         struct mlx5_ib_flow_prio *ft_tx,
3489                                                         struct mlx5_flow_destination *dst)
3490 {
3491         struct mlx5_ib_flow_handler *handler_rx;
3492         struct mlx5_ib_flow_handler *handler_tx;
3493         int err;
3494         static const struct ib_flow_attr flow_attr  = {
3495                 .num_of_specs = 0,
3496                 .size = sizeof(flow_attr)
3497         };
3498
3499         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
3500         if (IS_ERR(handler_rx)) {
3501                 err = PTR_ERR(handler_rx);
3502                 goto err;
3503         }
3504
3505         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
3506         if (IS_ERR(handler_tx)) {
3507                 err = PTR_ERR(handler_tx);
3508                 goto err_tx;
3509         }
3510
3511         list_add(&handler_tx->list, &handler_rx->list);
3512
3513         return handler_rx;
3514
3515 err_tx:
3516         mlx5_del_flow_rules(handler_rx->rule);
3517         ft_rx->refcount--;
3518         kfree(handler_rx);
3519 err:
3520         return ERR_PTR(err);
3521 }
3522
3523 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
3524                                            struct ib_flow_attr *flow_attr,
3525                                            int domain,
3526                                            struct ib_udata *udata)
3527 {
3528         struct mlx5_ib_dev *dev = to_mdev(qp->device);
3529         struct mlx5_ib_qp *mqp = to_mqp(qp);
3530         struct mlx5_ib_flow_handler *handler = NULL;
3531         struct mlx5_flow_destination *dst = NULL;
3532         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
3533         struct mlx5_ib_flow_prio *ft_prio;
3534         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3535         struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
3536         size_t min_ucmd_sz, required_ucmd_sz;
3537         int err;
3538         int underlay_qpn;
3539
3540         if (udata && udata->inlen) {
3541                 min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
3542                                 sizeof(ucmd_hdr.reserved);
3543                 if (udata->inlen < min_ucmd_sz)
3544                         return ERR_PTR(-EOPNOTSUPP);
3545
3546                 err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
3547                 if (err)
3548                         return ERR_PTR(err);
3549
3550                 /* currently supports only one counters data */
3551                 if (ucmd_hdr.ncounters_data > 1)
3552                         return ERR_PTR(-EINVAL);
3553
3554                 required_ucmd_sz = min_ucmd_sz +
3555                         sizeof(struct mlx5_ib_flow_counters_data) *
3556                         ucmd_hdr.ncounters_data;
3557                 if (udata->inlen > required_ucmd_sz &&
3558                     !ib_is_udata_cleared(udata, required_ucmd_sz,
3559                                          udata->inlen - required_ucmd_sz))
3560                         return ERR_PTR(-EOPNOTSUPP);
3561
3562                 ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
3563                 if (!ucmd)
3564                         return ERR_PTR(-ENOMEM);
3565
3566                 err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
3567                 if (err)
3568                         goto free_ucmd;
3569         }
3570
3571         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
3572                 err = -ENOMEM;
3573                 goto free_ucmd;
3574         }
3575
3576         if (domain != IB_FLOW_DOMAIN_USER ||
3577             flow_attr->port > dev->num_ports ||
3578             (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
3579                                   IB_FLOW_ATTR_FLAGS_EGRESS))) {
3580                 err = -EINVAL;
3581                 goto free_ucmd;
3582         }
3583
3584         if (is_egress &&
3585             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3586              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3587                 err = -EINVAL;
3588                 goto free_ucmd;
3589         }
3590
3591         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
3592         if (!dst) {
3593                 err = -ENOMEM;
3594                 goto free_ucmd;
3595         }
3596
3597         mutex_lock(&dev->flow_db->lock);
3598
3599         ft_prio = get_flow_table(dev, flow_attr,
3600                                  is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
3601         if (IS_ERR(ft_prio)) {
3602                 err = PTR_ERR(ft_prio);
3603                 goto unlock;
3604         }
3605         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3606                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
3607                 if (IS_ERR(ft_prio_tx)) {
3608                         err = PTR_ERR(ft_prio_tx);
3609                         ft_prio_tx = NULL;
3610                         goto destroy_ft;
3611                 }
3612         }
3613
3614         if (is_egress) {
3615                 dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
3616         } else {
3617                 dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
3618                 if (mqp->flags & MLX5_IB_QP_RSS)
3619                         dst->tir_num = mqp->rss_qp.tirn;
3620                 else
3621                         dst->tir_num = mqp->raw_packet_qp.rq.tirn;
3622         }
3623
3624         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3625                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
3626                         handler = create_dont_trap_rule(dev, ft_prio,
3627                                                         flow_attr, dst);
3628                 } else {
3629                         underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ?
3630                                         mqp->underlay_qpn : 0;
3631                         handler = _create_flow_rule(dev, ft_prio, flow_attr,
3632                                                     dst, underlay_qpn, ucmd);
3633                 }
3634         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3635                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3636                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
3637                                                 dst);
3638         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3639                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
3640         } else {
3641                 err = -EINVAL;
3642                 goto destroy_ft;
3643         }
3644
3645         if (IS_ERR(handler)) {
3646                 err = PTR_ERR(handler);
3647                 handler = NULL;
3648                 goto destroy_ft;
3649         }
3650
3651         mutex_unlock(&dev->flow_db->lock);
3652         kfree(dst);
3653         kfree(ucmd);
3654
3655         return &handler->ibflow;
3656
3657 destroy_ft:
3658         put_flow_table(dev, ft_prio, false);
3659         if (ft_prio_tx)
3660                 put_flow_table(dev, ft_prio_tx, false);
3661 unlock:
3662         mutex_unlock(&dev->flow_db->lock);
3663         kfree(dst);
3664 free_ucmd:
3665         kfree(ucmd);
3666         return ERR_PTR(err);
3667 }
3668
3669 static struct mlx5_ib_flow_prio *
3670 _get_flow_table(struct mlx5_ib_dev *dev,
3671                 struct mlx5_ib_flow_matcher *fs_matcher,
3672                 bool mcast)
3673 {
3674         struct mlx5_flow_namespace *ns = NULL;
3675         struct mlx5_ib_flow_prio *prio;
3676         int max_table_size;
3677         u32 flags = 0;
3678         int priority;
3679
3680         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
3681                 max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3682                                         log_max_ft_size));
3683                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
3684                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3685                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3686                                               reformat_l3_tunnel_to_l2))
3687                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3688         } else { /* Can only be MLX5_FLOW_NAMESPACE_EGRESS */
3689                 max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
3690                                         log_max_ft_size));
3691                 if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
3692                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3693         }
3694
3695         if (max_table_size < MLX5_FS_MAX_ENTRIES)
3696                 return ERR_PTR(-ENOMEM);
3697
3698         if (mcast)
3699                 priority = MLX5_IB_FLOW_MCAST_PRIO;
3700         else
3701                 priority = ib_prio_to_core_prio(fs_matcher->priority, false);
3702
3703         ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
3704         if (!ns)
3705                 return ERR_PTR(-ENOTSUPP);
3706
3707         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
3708                 prio = &dev->flow_db->prios[priority];
3709         else
3710                 prio = &dev->flow_db->egress_prios[priority];
3711
3712         if (prio->flow_table)
3713                 return prio;
3714
3715         return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES,
3716                          MLX5_FS_MAX_TYPES, flags);
3717 }
3718
3719 static struct mlx5_ib_flow_handler *
3720 _create_raw_flow_rule(struct mlx5_ib_dev *dev,
3721                       struct mlx5_ib_flow_prio *ft_prio,
3722                       struct mlx5_flow_destination *dst,
3723                       struct mlx5_ib_flow_matcher  *fs_matcher,
3724                       struct mlx5_flow_act *flow_act,
3725                       void *cmd_in, int inlen,
3726                       int dst_num)
3727 {
3728         struct mlx5_ib_flow_handler *handler;
3729         struct mlx5_flow_spec *spec;
3730         struct mlx5_flow_table *ft = ft_prio->flow_table;
3731         int err = 0;
3732
3733         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
3734         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
3735         if (!handler || !spec) {
3736                 err = -ENOMEM;
3737                 goto free;
3738         }
3739
3740         INIT_LIST_HEAD(&handler->list);
3741
3742         memcpy(spec->match_value, cmd_in, inlen);
3743         memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
3744                fs_matcher->mask_len);
3745         spec->match_criteria_enable = fs_matcher->match_criteria_enable;
3746
3747         handler->rule = mlx5_add_flow_rules(ft, spec,
3748                                             flow_act, dst, dst_num);
3749
3750         if (IS_ERR(handler->rule)) {
3751                 err = PTR_ERR(handler->rule);
3752                 goto free;
3753         }
3754
3755         ft_prio->refcount++;
3756         handler->prio = ft_prio;
3757         handler->dev = dev;
3758         ft_prio->flow_table = ft;
3759
3760 free:
3761         if (err)
3762                 kfree(handler);
3763         kvfree(spec);
3764         return err ? ERR_PTR(err) : handler;
3765 }
3766
3767 static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
3768                                 void *match_v)
3769 {
3770         void *match_c;
3771         void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
3772         void *dmac, *dmac_mask;
3773         void *ipv4, *ipv4_mask;
3774
3775         if (!(fs_matcher->match_criteria_enable &
3776               (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
3777                 return false;
3778
3779         match_c = fs_matcher->matcher_mask.match_params;
3780         match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
3781                                            outer_headers);
3782         match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
3783                                            outer_headers);
3784
3785         dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
3786                             dmac_47_16);
3787         dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
3788                                  dmac_47_16);
3789
3790         if (is_multicast_ether_addr(dmac) &&
3791             is_multicast_ether_addr(dmac_mask))
3792                 return true;
3793
3794         ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
3795                             dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
3796
3797         ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
3798                                  dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
3799
3800         if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
3801             ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
3802                 return true;
3803
3804         return false;
3805 }
3806
3807 struct mlx5_ib_flow_handler *
3808 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
3809                         struct mlx5_ib_flow_matcher *fs_matcher,
3810                         struct mlx5_flow_act *flow_act,
3811                         u32 counter_id,
3812                         void *cmd_in, int inlen, int dest_id,
3813                         int dest_type)
3814 {
3815         struct mlx5_flow_destination *dst;
3816         struct mlx5_ib_flow_prio *ft_prio;
3817         struct mlx5_ib_flow_handler *handler;
3818         int dst_num = 0;
3819         bool mcast;
3820         int err;
3821
3822         if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
3823                 return ERR_PTR(-EOPNOTSUPP);
3824
3825         if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
3826                 return ERR_PTR(-ENOMEM);
3827
3828         dst = kzalloc(sizeof(*dst) * 2, GFP_KERNEL);
3829         if (!dst)
3830                 return ERR_PTR(-ENOMEM);
3831
3832         mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
3833         mutex_lock(&dev->flow_db->lock);
3834
3835         ft_prio = _get_flow_table(dev, fs_matcher, mcast);
3836         if (IS_ERR(ft_prio)) {
3837                 err = PTR_ERR(ft_prio);
3838                 goto unlock;
3839         }
3840
3841         if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
3842                 dst[dst_num].type = dest_type;
3843                 dst[dst_num].tir_num = dest_id;
3844                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
3845         } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
3846                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
3847                 dst[dst_num].ft_num = dest_id;
3848                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
3849         } else {
3850                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
3851                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
3852         }
3853
3854         dst_num++;
3855
3856         if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
3857                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
3858                 dst[dst_num].counter_id = counter_id;
3859                 dst_num++;
3860         }
3861
3862         handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act,
3863                                         cmd_in, inlen, dst_num);
3864
3865         if (IS_ERR(handler)) {
3866                 err = PTR_ERR(handler);
3867                 goto destroy_ft;
3868         }
3869
3870         mutex_unlock(&dev->flow_db->lock);
3871         atomic_inc(&fs_matcher->usecnt);
3872         handler->flow_matcher = fs_matcher;
3873
3874         kfree(dst);
3875
3876         return handler;
3877
3878 destroy_ft:
3879         put_flow_table(dev, ft_prio, false);
3880 unlock:
3881         mutex_unlock(&dev->flow_db->lock);
3882         kfree(dst);
3883
3884         return ERR_PTR(err);
3885 }
3886
3887 static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
3888 {
3889         u32 flags = 0;
3890
3891         if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
3892                 flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
3893
3894         return flags;
3895 }
3896
3897 #define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED      MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
3898 static struct ib_flow_action *
3899 mlx5_ib_create_flow_action_esp(struct ib_device *device,
3900                                const struct ib_flow_action_attrs_esp *attr,
3901                                struct uverbs_attr_bundle *attrs)
3902 {
3903         struct mlx5_ib_dev *mdev = to_mdev(device);
3904         struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
3905         struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
3906         struct mlx5_ib_flow_action *action;
3907         u64 action_flags;
3908         u64 flags;
3909         int err = 0;
3910
3911         err = uverbs_get_flags64(
3912                 &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
3913                 ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
3914         if (err)
3915                 return ERR_PTR(err);
3916
3917         flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
3918
3919         /* We current only support a subset of the standard features. Only a
3920          * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn
3921          * (with overlap). Full offload mode isn't supported.
3922          */
3923         if (!attr->keymat || attr->replay || attr->encap ||
3924             attr->spi || attr->seq || attr->tfc_pad ||
3925             attr->hard_limit_pkts ||
3926             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
3927                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
3928                 return ERR_PTR(-EOPNOTSUPP);
3929
3930         if (attr->keymat->protocol !=
3931             IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
3932                 return ERR_PTR(-EOPNOTSUPP);
3933
3934         aes_gcm = &attr->keymat->keymat.aes_gcm;
3935
3936         if (aes_gcm->icv_len != 16 ||
3937             aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
3938                 return ERR_PTR(-EOPNOTSUPP);
3939
3940         action = kmalloc(sizeof(*action), GFP_KERNEL);
3941         if (!action)
3942                 return ERR_PTR(-ENOMEM);
3943
3944         action->esp_aes_gcm.ib_flags = attr->flags;
3945         memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
3946                sizeof(accel_attrs.keymat.aes_gcm.aes_key));
3947         accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
3948         memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
3949                sizeof(accel_attrs.keymat.aes_gcm.salt));
3950         memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
3951                sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
3952         accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
3953         accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
3954         accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
3955
3956         accel_attrs.esn = attr->esn;
3957         if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
3958                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
3959         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
3960                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
3961
3962         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
3963                 accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
3964
3965         action->esp_aes_gcm.ctx =
3966                 mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
3967         if (IS_ERR(action->esp_aes_gcm.ctx)) {
3968                 err = PTR_ERR(action->esp_aes_gcm.ctx);
3969                 goto err_parse;
3970         }
3971
3972         action->esp_aes_gcm.ib_flags = attr->flags;
3973
3974         return &action->ib_action;
3975
3976 err_parse:
3977         kfree(action);
3978         return ERR_PTR(err);
3979 }
3980
3981 static int
3982 mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
3983                                const struct ib_flow_action_attrs_esp *attr,
3984                                struct uverbs_attr_bundle *attrs)
3985 {
3986         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
3987         struct mlx5_accel_esp_xfrm_attrs accel_attrs;
3988         int err = 0;
3989
3990         if (attr->keymat || attr->replay || attr->encap ||
3991             attr->spi || attr->seq || attr->tfc_pad ||
3992             attr->hard_limit_pkts ||
3993             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
3994                              IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
3995                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
3996                 return -EOPNOTSUPP;
3997
3998         /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
3999          * be modified.
4000          */
4001         if (!(maction->esp_aes_gcm.ib_flags &
4002               IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
4003             attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4004                            IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
4005                 return -EINVAL;
4006
4007         memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
4008                sizeof(accel_attrs));
4009
4010         accel_attrs.esn = attr->esn;
4011         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4012                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4013         else
4014                 accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4015
4016         err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
4017                                          &accel_attrs);
4018         if (err)
4019                 return err;
4020
4021         maction->esp_aes_gcm.ib_flags &=
4022                 ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4023         maction->esp_aes_gcm.ib_flags |=
4024                 attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4025
4026         return 0;
4027 }
4028
4029 static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
4030 {
4031         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4032
4033         switch (action->type) {
4034         case IB_FLOW_ACTION_ESP:
4035                 /*
4036                  * We only support aes_gcm by now, so we implicitly know this is
4037                  * the underline crypto.
4038                  */
4039                 mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
4040                 break;
4041         case IB_FLOW_ACTION_UNSPECIFIED:
4042                 mlx5_ib_destroy_flow_action_raw(maction);
4043                 break;
4044         default:
4045                 WARN_ON(true);
4046                 break;
4047         }
4048
4049         kfree(maction);
4050         return 0;
4051 }
4052
4053 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4054 {
4055         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4056         struct mlx5_ib_qp *mqp = to_mqp(ibqp);
4057         int err;
4058         u16 uid;
4059
4060         uid = ibqp->pd ?
4061                 to_mpd(ibqp->pd)->uid : 0;
4062
4063         if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
4064                 mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
4065                 return -EOPNOTSUPP;
4066         }
4067
4068         err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4069         if (err)
4070                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
4071                              ibqp->qp_num, gid->raw);
4072
4073         return err;
4074 }
4075
4076 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4077 {
4078         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4079         int err;
4080         u16 uid;
4081
4082         uid = ibqp->pd ?
4083                 to_mpd(ibqp->pd)->uid : 0;
4084         err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4085         if (err)
4086                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
4087                              ibqp->qp_num, gid->raw);
4088
4089         return err;
4090 }
4091
4092 static int init_node_data(struct mlx5_ib_dev *dev)
4093 {
4094         int err;
4095
4096         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
4097         if (err)
4098                 return err;
4099
4100         dev->mdev->rev_id = dev->mdev->pdev->revision;
4101
4102         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
4103 }
4104
4105 static ssize_t fw_pages_show(struct device *device,
4106                              struct device_attribute *attr, char *buf)
4107 {
4108         struct mlx5_ib_dev *dev =
4109                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
4110
4111         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
4112 }
4113 static DEVICE_ATTR_RO(fw_pages);
4114
4115 static ssize_t reg_pages_show(struct device *device,
4116                               struct device_attribute *attr, char *buf)
4117 {
4118         struct mlx5_ib_dev *dev =
4119                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
4120
4121         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
4122 }
4123 static DEVICE_ATTR_RO(reg_pages);
4124
4125 static ssize_t hca_type_show(struct device *device,
4126                              struct device_attribute *attr, char *buf)
4127 {
4128         struct mlx5_ib_dev *dev =
4129                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
4130         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
4131 }
4132 static DEVICE_ATTR_RO(hca_type);
4133
4134 static ssize_t hw_rev_show(struct device *device,
4135                            struct device_attribute *attr, char *buf)
4136 {
4137         struct mlx5_ib_dev *dev =
4138                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
4139         return sprintf(buf, "%x\n", dev->mdev->rev_id);
4140 }
4141 static DEVICE_ATTR_RO(hw_rev);
4142
4143 static ssize_t board_id_show(struct device *device,
4144                              struct device_attribute *attr, char *buf)
4145 {
4146         struct mlx5_ib_dev *dev =
4147                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
4148         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
4149                        dev->mdev->board_id);
4150 }
4151 static DEVICE_ATTR_RO(board_id);
4152
4153 static struct attribute *mlx5_class_attributes[] = {
4154         &dev_attr_hw_rev.attr,
4155         &dev_attr_hca_type.attr,
4156         &dev_attr_board_id.attr,
4157         &dev_attr_fw_pages.attr,
4158         &dev_attr_reg_pages.attr,
4159         NULL,
4160 };
4161
4162 static const struct attribute_group mlx5_attr_group = {
4163         .attrs = mlx5_class_attributes,
4164 };
4165
4166 static void pkey_change_handler(struct work_struct *work)
4167 {
4168         struct mlx5_ib_port_resources *ports =
4169                 container_of(work, struct mlx5_ib_port_resources,
4170                              pkey_change_work);
4171
4172         mutex_lock(&ports->devr->mutex);
4173         mlx5_ib_gsi_pkey_change(ports->gsi);
4174         mutex_unlock(&ports->devr->mutex);
4175 }
4176
4177 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
4178 {
4179         struct mlx5_ib_qp *mqp;
4180         struct mlx5_ib_cq *send_mcq, *recv_mcq;
4181         struct mlx5_core_cq *mcq;
4182         struct list_head cq_armed_list;
4183         unsigned long flags_qp;
4184         unsigned long flags_cq;
4185         unsigned long flags;
4186
4187         INIT_LIST_HEAD(&cq_armed_list);
4188
4189         /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/
4190         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
4191         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
4192                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
4193                 if (mqp->sq.tail != mqp->sq.head) {
4194                         send_mcq = to_mcq(mqp->ibqp.send_cq);
4195                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
4196                         if (send_mcq->mcq.comp &&
4197                             mqp->ibqp.send_cq->comp_handler) {
4198                                 if (!send_mcq->mcq.reset_notify_added) {
4199                                         send_mcq->mcq.reset_notify_added = 1;
4200                                         list_add_tail(&send_mcq->mcq.reset_notify,
4201                                                       &cq_armed_list);
4202                                 }
4203                         }
4204                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
4205                 }
4206                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
4207                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
4208                 /* no handling is needed for SRQ */
4209                 if (!mqp->ibqp.srq) {
4210                         if (mqp->rq.tail != mqp->rq.head) {
4211                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
4212                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
4213                                 if (recv_mcq->mcq.comp &&
4214                                     mqp->ibqp.recv_cq->comp_handler) {
4215                                         if (!recv_mcq->mcq.reset_notify_added) {
4216                                                 recv_mcq->mcq.reset_notify_added = 1;
4217                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
4218                                                               &cq_armed_list);
4219                                         }
4220                                 }
4221                                 spin_unlock_irqrestore(&recv_mcq->lock,
4222                                                        flags_cq);
4223                         }
4224                 }
4225                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
4226         }
4227         /*At that point all inflight post send were put to be executed as of we
4228          * lock/unlock above locks Now need to arm all involved CQs.
4229          */
4230         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
4231                 mcq->comp(mcq);
4232         }
4233         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
4234 }
4235
4236 static void delay_drop_handler(struct work_struct *work)
4237 {
4238         int err;
4239         struct mlx5_ib_delay_drop *delay_drop =
4240                 container_of(work, struct mlx5_ib_delay_drop,
4241                              delay_drop_work);
4242
4243         atomic_inc(&delay_drop->events_cnt);
4244
4245         mutex_lock(&delay_drop->lock);
4246         err = mlx5_core_set_delay_drop(delay_drop->dev->mdev,
4247                                        delay_drop->timeout);
4248         if (err) {
4249                 mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
4250                              delay_drop->timeout);
4251                 delay_drop->activate = false;
4252         }
4253         mutex_unlock(&delay_drop->lock);
4254 }
4255
4256 static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4257                                  struct ib_event *ibev)
4258 {
4259         switch (eqe->sub_type) {
4260         case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
4261                 schedule_work(&ibdev->delay_drop.delay_drop_work);
4262                 break;
4263         default: /* do nothing */
4264                 return;
4265         }
4266 }
4267
4268 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4269                               struct ib_event *ibev)
4270 {
4271         u8 port = (eqe->data.port.port >> 4) & 0xf;
4272
4273         ibev->element.port_num = port;
4274
4275         switch (eqe->sub_type) {
4276         case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
4277         case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
4278         case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
4279                 /* In RoCE, port up/down events are handled in
4280                  * mlx5_netdev_event().
4281                  */
4282                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4283                                             IB_LINK_LAYER_ETHERNET)
4284                         return -EINVAL;
4285
4286                 ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
4287                                 IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
4288                 break;
4289
4290         case MLX5_PORT_CHANGE_SUBTYPE_LID:
4291                 ibev->event = IB_EVENT_LID_CHANGE;
4292                 break;
4293
4294         case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
4295                 ibev->event = IB_EVENT_PKEY_CHANGE;
4296                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
4297                 break;
4298
4299         case MLX5_PORT_CHANGE_SUBTYPE_GUID:
4300                 ibev->event = IB_EVENT_GID_CHANGE;
4301                 break;
4302
4303         case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
4304                 ibev->event = IB_EVENT_CLIENT_REREGISTER;
4305                 break;
4306         default:
4307                 return -EINVAL;
4308         }
4309
4310         return 0;
4311 }
4312
4313 static void mlx5_ib_handle_event(struct work_struct *_work)
4314 {
4315         struct mlx5_ib_event_work *work =
4316                 container_of(_work, struct mlx5_ib_event_work, work);
4317         struct mlx5_ib_dev *ibdev;
4318         struct ib_event ibev;
4319         bool fatal = false;
4320
4321         if (work->is_slave) {
4322                 ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
4323                 if (!ibdev)
4324                         goto out;
4325         } else {
4326                 ibdev = work->dev;
4327         }
4328
4329         switch (work->event) {
4330         case MLX5_DEV_EVENT_SYS_ERROR:
4331                 ibev.event = IB_EVENT_DEVICE_FATAL;
4332                 mlx5_ib_handle_internal_error(ibdev);
4333                 ibev.element.port_num  = (u8)(unsigned long)work->param;
4334                 fatal = true;
4335                 break;
4336         case MLX5_EVENT_TYPE_PORT_CHANGE:
4337                 if (handle_port_change(ibdev, work->param, &ibev))
4338                         goto out;
4339                 break;
4340         case MLX5_EVENT_TYPE_GENERAL_EVENT:
4341                 handle_general_event(ibdev, work->param, &ibev);
4342                 /* fall through */
4343         default:
4344                 goto out;
4345         }
4346
4347         ibev.device = &ibdev->ib_dev;
4348
4349         if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
4350                 mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
4351                 goto out;
4352         }
4353
4354         if (ibdev->ib_active)
4355                 ib_dispatch_event(&ibev);
4356
4357         if (fatal)
4358                 ibdev->ib_active = false;
4359 out:
4360         kfree(work);
4361 }
4362
4363 static int mlx5_ib_event(struct notifier_block *nb,
4364                          unsigned long event, void *param)
4365 {
4366         struct mlx5_ib_event_work *work;
4367
4368         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4369         if (!work)
4370                 return NOTIFY_DONE;
4371
4372         INIT_WORK(&work->work, mlx5_ib_handle_event);
4373         work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
4374         work->is_slave = false;
4375         work->param = param;
4376         work->event = event;
4377
4378         queue_work(mlx5_ib_event_wq, &work->work);
4379
4380         return NOTIFY_OK;
4381 }
4382
4383 static int mlx5_ib_event_slave_port(struct notifier_block *nb,
4384                                     unsigned long event, void *param)
4385 {
4386         struct mlx5_ib_event_work *work;
4387
4388         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4389         if (!work)
4390                 return NOTIFY_DONE;
4391
4392         INIT_WORK(&work->work, mlx5_ib_handle_event);
4393         work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
4394         work->is_slave = true;
4395         work->param = param;
4396         work->event = event;
4397         queue_work(mlx5_ib_event_wq, &work->work);
4398
4399         return NOTIFY_OK;
4400 }
4401
4402 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
4403 {
4404         struct mlx5_hca_vport_context vport_ctx;
4405         int err;
4406         int port;
4407
4408         for (port = 1; port <= dev->num_ports; port++) {
4409                 dev->mdev->port_caps[port - 1].has_smi = false;
4410                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
4411                     MLX5_CAP_PORT_TYPE_IB) {
4412                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
4413                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
4414                                                                    port, 0,
4415                                                                    &vport_ctx);
4416                                 if (err) {
4417                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
4418                                                     port, err);
4419                                         return err;
4420                                 }
4421                                 dev->mdev->port_caps[port - 1].has_smi =
4422                                         vport_ctx.has_smi;
4423                         } else {
4424                                 dev->mdev->port_caps[port - 1].has_smi = true;
4425                         }
4426                 }
4427         }
4428         return 0;
4429 }
4430
4431 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
4432 {
4433         int port;
4434
4435         for (port = 1; port <= dev->num_ports; port++)
4436                 mlx5_query_ext_port_caps(dev, port);
4437 }
4438
4439 static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4440 {
4441         struct ib_device_attr *dprops = NULL;
4442         struct ib_port_attr *pprops = NULL;
4443         int err = -ENOMEM;
4444         struct ib_udata uhw = {.inlen = 0, .outlen = 0};
4445
4446         pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
4447         if (!pprops)
4448                 goto out;
4449
4450         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
4451         if (!dprops)
4452                 goto out;
4453
4454         err = set_has_smi_cap(dev);
4455         if (err)
4456                 goto out;
4457
4458         err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
4459         if (err) {
4460                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
4461                 goto out;
4462         }
4463
4464         memset(pprops, 0, sizeof(*pprops));
4465         err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
4466         if (err) {
4467                 mlx5_ib_warn(dev, "query_port %d failed %d\n",
4468                              port, err);
4469                 goto out;
4470         }
4471
4472         dev->mdev->port_caps[port - 1].pkey_table_len =
4473                                         dprops->max_pkeys;
4474         dev->mdev->port_caps[port - 1].gid_table_len =
4475                                         pprops->gid_tbl_len;
4476         mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
4477                     port, dprops->max_pkeys, pprops->gid_tbl_len);
4478
4479 out:
4480         kfree(pprops);
4481         kfree(dprops);
4482
4483         return err;
4484 }
4485
4486 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
4487 {
4488         int err;
4489
4490         err = mlx5_mr_cache_cleanup(dev);
4491         if (err)
4492                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
4493
4494         if (dev->umrc.qp)
4495                 mlx5_ib_destroy_qp(dev->umrc.qp);
4496         if (dev->umrc.cq)
4497                 ib_free_cq(dev->umrc.cq);
4498         if (dev->umrc.pd)
4499                 ib_dealloc_pd(dev->umrc.pd);
4500 }
4501
4502 enum {
4503         MAX_UMR_WR = 128,
4504 };
4505
4506 static int create_umr_res(struct mlx5_ib_dev *dev)
4507 {
4508         struct ib_qp_init_attr *init_attr = NULL;
4509         struct ib_qp_attr *attr = NULL;
4510         struct ib_pd *pd;
4511         struct ib_cq *cq;
4512         struct ib_qp *qp;
4513         int ret;
4514
4515         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
4516         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
4517         if (!attr || !init_attr) {
4518                 ret = -ENOMEM;
4519                 goto error_0;
4520         }
4521
4522         pd = ib_alloc_pd(&dev->ib_dev, 0);
4523         if (IS_ERR(pd)) {
4524                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
4525                 ret = PTR_ERR(pd);
4526                 goto error_0;
4527         }
4528
4529         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
4530         if (IS_ERR(cq)) {
4531                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
4532                 ret = PTR_ERR(cq);
4533                 goto error_2;
4534         }
4535
4536         init_attr->send_cq = cq;
4537         init_attr->recv_cq = cq;
4538         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
4539         init_attr->cap.max_send_wr = MAX_UMR_WR;
4540         init_attr->cap.max_send_sge = 1;
4541         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
4542         init_attr->port_num = 1;
4543         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
4544         if (IS_ERR(qp)) {
4545                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
4546                 ret = PTR_ERR(qp);
4547                 goto error_3;
4548         }
4549         qp->device     = &dev->ib_dev;
4550         qp->real_qp    = qp;
4551         qp->uobject    = NULL;
4552         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
4553         qp->send_cq    = init_attr->send_cq;
4554         qp->recv_cq    = init_attr->recv_cq;
4555
4556         attr->qp_state = IB_QPS_INIT;
4557         attr->port_num = 1;
4558         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
4559                                 IB_QP_PORT, NULL);
4560         if (ret) {
4561                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
4562                 goto error_4;
4563         }
4564
4565         memset(attr, 0, sizeof(*attr));
4566         attr->qp_state = IB_QPS_RTR;
4567         attr->path_mtu = IB_MTU_256;
4568
4569         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4570         if (ret) {
4571                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
4572                 goto error_4;
4573         }
4574
4575         memset(attr, 0, sizeof(*attr));
4576         attr->qp_state = IB_QPS_RTS;
4577         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4578         if (ret) {
4579                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
4580                 goto error_4;
4581         }
4582
4583         dev->umrc.qp = qp;
4584         dev->umrc.cq = cq;
4585         dev->umrc.pd = pd;
4586
4587         sema_init(&dev->umrc.sem, MAX_UMR_WR);
4588         ret = mlx5_mr_cache_init(dev);
4589         if (ret) {
4590                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4591                 goto error_4;
4592         }
4593
4594         kfree(attr);
4595         kfree(init_attr);
4596
4597         return 0;
4598
4599 error_4:
4600         mlx5_ib_destroy_qp(qp);
4601         dev->umrc.qp = NULL;
4602
4603 error_3:
4604         ib_free_cq(cq);
4605         dev->umrc.cq = NULL;
4606
4607 error_2:
4608         ib_dealloc_pd(pd);
4609         dev->umrc.pd = NULL;
4610
4611 error_0:
4612         kfree(attr);
4613         kfree(init_attr);
4614         return ret;
4615 }
4616
4617 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
4618 {
4619         switch (umr_fence_cap) {
4620         case MLX5_CAP_UMR_FENCE_NONE:
4621                 return MLX5_FENCE_MODE_NONE;
4622         case MLX5_CAP_UMR_FENCE_SMALL:
4623                 return MLX5_FENCE_MODE_INITIATOR_SMALL;
4624         default:
4625                 return MLX5_FENCE_MODE_STRONG_ORDERING;
4626         }
4627 }
4628
4629 static int create_dev_resources(struct mlx5_ib_resources *devr)
4630 {
4631         struct ib_srq_init_attr attr;
4632         struct mlx5_ib_dev *dev;
4633         struct ib_cq_init_attr cq_attr = {.cqe = 1};
4634         int port;
4635         int ret = 0;
4636
4637         dev = container_of(devr, struct mlx5_ib_dev, devr);
4638
4639         mutex_init(&devr->mutex);
4640
4641         devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
4642         if (IS_ERR(devr->p0)) {
4643                 ret = PTR_ERR(devr->p0);
4644                 goto error0;
4645         }
4646         devr->p0->device  = &dev->ib_dev;
4647         devr->p0->uobject = NULL;
4648         atomic_set(&devr->p0->usecnt, 0);
4649
4650         devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
4651         if (IS_ERR(devr->c0)) {
4652                 ret = PTR_ERR(devr->c0);
4653                 goto error1;
4654         }
4655         devr->c0->device        = &dev->ib_dev;
4656         devr->c0->uobject       = NULL;
4657         devr->c0->comp_handler  = NULL;
4658         devr->c0->event_handler = NULL;
4659         devr->c0->cq_context    = NULL;
4660         atomic_set(&devr->c0->usecnt, 0);
4661
4662         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
4663         if (IS_ERR(devr->x0)) {
4664                 ret = PTR_ERR(devr->x0);
4665                 goto error2;
4666         }
4667         devr->x0->device = &dev->ib_dev;
4668         devr->x0->inode = NULL;
4669         atomic_set(&devr->x0->usecnt, 0);
4670         mutex_init(&devr->x0->tgt_qp_mutex);
4671         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
4672
4673         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
4674         if (IS_ERR(devr->x1)) {
4675                 ret = PTR_ERR(devr->x1);
4676                 goto error3;
4677         }
4678         devr->x1->device = &dev->ib_dev;
4679         devr->x1->inode = NULL;
4680         atomic_set(&devr->x1->usecnt, 0);
4681         mutex_init(&devr->x1->tgt_qp_mutex);
4682         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
4683
4684         memset(&attr, 0, sizeof(attr));
4685         attr.attr.max_sge = 1;
4686         attr.attr.max_wr = 1;
4687         attr.srq_type = IB_SRQT_XRC;
4688         attr.ext.cq = devr->c0;
4689         attr.ext.xrc.xrcd = devr->x0;
4690
4691         devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
4692         if (IS_ERR(devr->s0)) {
4693                 ret = PTR_ERR(devr->s0);
4694                 goto error4;
4695         }
4696         devr->s0->device        = &dev->ib_dev;
4697         devr->s0->pd            = devr->p0;
4698         devr->s0->uobject       = NULL;
4699         devr->s0->event_handler = NULL;
4700         devr->s0->srq_context   = NULL;
4701         devr->s0->srq_type      = IB_SRQT_XRC;
4702         devr->s0->ext.xrc.xrcd  = devr->x0;
4703         devr->s0->ext.cq        = devr->c0;
4704         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
4705         atomic_inc(&devr->s0->ext.cq->usecnt);
4706         atomic_inc(&devr->p0->usecnt);
4707         atomic_set(&devr->s0->usecnt, 0);
4708
4709         memset(&attr, 0, sizeof(attr));
4710         attr.attr.max_sge = 1;
4711         attr.attr.max_wr = 1;
4712         attr.srq_type = IB_SRQT_BASIC;
4713         devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
4714         if (IS_ERR(devr->s1)) {
4715                 ret = PTR_ERR(devr->s1);
4716                 goto error5;
4717         }
4718         devr->s1->device        = &dev->ib_dev;
4719         devr->s1->pd            = devr->p0;
4720         devr->s1->uobject       = NULL;
4721         devr->s1->event_handler = NULL;
4722         devr->s1->srq_context   = NULL;
4723         devr->s1->srq_type      = IB_SRQT_BASIC;
4724         devr->s1->ext.cq        = devr->c0;
4725         atomic_inc(&devr->p0->usecnt);
4726         atomic_set(&devr->s1->usecnt, 0);
4727
4728         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
4729                 INIT_WORK(&devr->ports[port].pkey_change_work,
4730                           pkey_change_handler);
4731                 devr->ports[port].devr = devr;
4732         }
4733
4734         return 0;
4735
4736 error5:
4737         mlx5_ib_destroy_srq(devr->s0);
4738 error4:
4739         mlx5_ib_dealloc_xrcd(devr->x1);
4740 error3:
4741         mlx5_ib_dealloc_xrcd(devr->x0);
4742 error2:
4743         mlx5_ib_destroy_cq(devr->c0);
4744 error1:
4745         mlx5_ib_dealloc_pd(devr->p0);
4746 error0:
4747         return ret;
4748 }
4749
4750 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
4751 {
4752         struct mlx5_ib_dev *dev =
4753                 container_of(devr, struct mlx5_ib_dev, devr);
4754         int port;
4755
4756         mlx5_ib_destroy_srq(devr->s1);
4757         mlx5_ib_destroy_srq(devr->s0);
4758         mlx5_ib_dealloc_xrcd(devr->x0);
4759         mlx5_ib_dealloc_xrcd(devr->x1);
4760         mlx5_ib_destroy_cq(devr->c0);
4761         mlx5_ib_dealloc_pd(devr->p0);
4762
4763         /* Make sure no change P_Key work items are still executing */
4764         for (port = 0; port < dev->num_ports; ++port)
4765                 cancel_work_sync(&devr->ports[port].pkey_change_work);
4766 }
4767
4768 static u32 get_core_cap_flags(struct ib_device *ibdev,
4769                               struct mlx5_hca_vport_context *rep)
4770 {
4771         struct mlx5_ib_dev *dev = to_mdev(ibdev);
4772         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
4773         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
4774         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
4775         bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
4776         u32 ret = 0;
4777
4778         if (rep->grh_required)
4779                 ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
4780
4781         if (ll == IB_LINK_LAYER_INFINIBAND)
4782                 return ret | RDMA_CORE_PORT_IBA_IB;
4783
4784         if (raw_support)
4785                 ret |= RDMA_CORE_PORT_RAW_PACKET;
4786
4787         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
4788                 return ret;
4789
4790         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
4791                 return ret;
4792
4793         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
4794                 ret |= RDMA_CORE_PORT_IBA_ROCE;
4795
4796         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
4797                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
4798
4799         return ret;
4800 }
4801
4802 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
4803                                struct ib_port_immutable *immutable)
4804 {
4805         struct ib_port_attr attr;
4806         struct mlx5_ib_dev *dev = to_mdev(ibdev);
4807         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
4808         struct mlx5_hca_vport_context rep = {0};
4809         int err;
4810
4811         err = ib_query_port(ibdev, port_num, &attr);
4812         if (err)
4813                 return err;
4814
4815         if (ll == IB_LINK_LAYER_INFINIBAND) {
4816                 err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
4817                                                    &rep);
4818                 if (err)
4819                         return err;
4820         }
4821
4822         immutable->pkey_tbl_len = attr.pkey_tbl_len;
4823         immutable->gid_tbl_len = attr.gid_tbl_len;
4824         immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
4825         if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
4826                 immutable->max_mad_size = IB_MGMT_MAD_SIZE;
4827
4828         return 0;
4829 }
4830
4831 static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
4832                                    struct ib_port_immutable *immutable)
4833 {
4834         struct ib_port_attr attr;
4835         int err;
4836
4837         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
4838
4839         err = ib_query_port(ibdev, port_num, &attr);
4840         if (err)
4841                 return err;
4842
4843         immutable->pkey_tbl_len = attr.pkey_tbl_len;
4844         immutable->gid_tbl_len = attr.gid_tbl_len;
4845         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
4846
4847         return 0;
4848 }
4849
4850 static void get_dev_fw_str(struct ib_device *ibdev, char *str)
4851 {
4852         struct mlx5_ib_dev *dev =
4853                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
4854         snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
4855                  fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
4856                  fw_rev_sub(dev->mdev));
4857 }
4858
4859 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
4860 {
4861         struct mlx5_core_dev *mdev = dev->mdev;
4862         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
4863                                                                  MLX5_FLOW_NAMESPACE_LAG);
4864         struct mlx5_flow_table *ft;
4865         int err;
4866
4867         if (!ns || !mlx5_lag_is_active(mdev))
4868                 return 0;
4869
4870         err = mlx5_cmd_create_vport_lag(mdev);
4871         if (err)
4872                 return err;
4873
4874         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
4875         if (IS_ERR(ft)) {
4876                 err = PTR_ERR(ft);
4877                 goto err_destroy_vport_lag;
4878         }
4879
4880         dev->flow_db->lag_demux_ft = ft;
4881         return 0;
4882
4883 err_destroy_vport_lag:
4884         mlx5_cmd_destroy_vport_lag(mdev);
4885         return err;
4886 }
4887
4888 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
4889 {
4890         struct mlx5_core_dev *mdev = dev->mdev;
4891
4892         if (dev->flow_db->lag_demux_ft) {
4893                 mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
4894                 dev->flow_db->lag_demux_ft = NULL;
4895
4896                 mlx5_cmd_destroy_vport_lag(mdev);
4897         }
4898 }
4899
4900 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
4901 {
4902         int err;
4903
4904         dev->roce[port_num].nb.notifier_call = mlx5_netdev_event;
4905         err = register_netdevice_notifier(&dev->roce[port_num].nb);
4906         if (err) {
4907                 dev->roce[port_num].nb.notifier_call = NULL;
4908                 return err;
4909         }
4910
4911         return 0;
4912 }
4913
4914 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
4915 {
4916         if (dev->roce[port_num].nb.notifier_call) {
4917                 unregister_netdevice_notifier(&dev->roce[port_num].nb);
4918                 dev->roce[port_num].nb.notifier_call = NULL;
4919         }
4920 }
4921
4922 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
4923 {
4924         int err;
4925
4926         if (MLX5_CAP_GEN(dev->mdev, roce)) {
4927                 err = mlx5_nic_vport_enable_roce(dev->mdev);
4928                 if (err)
4929                         return err;
4930         }
4931
4932         err = mlx5_eth_lag_init(dev);
4933         if (err)
4934                 goto err_disable_roce;
4935
4936         return 0;
4937
4938 err_disable_roce:
4939         if (MLX5_CAP_GEN(dev->mdev, roce))
4940                 mlx5_nic_vport_disable_roce(dev->mdev);
4941
4942         return err;
4943 }
4944
4945 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
4946 {
4947         mlx5_eth_lag_cleanup(dev);
4948         if (MLX5_CAP_GEN(dev->mdev, roce))
4949                 mlx5_nic_vport_disable_roce(dev->mdev);
4950 }
4951
4952 struct mlx5_ib_counter {
4953         const char *name;
4954         size_t offset;
4955 };
4956
4957 #define INIT_Q_COUNTER(_name)           \
4958         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
4959
4960 static const struct mlx5_ib_counter basic_q_cnts[] = {
4961         INIT_Q_COUNTER(rx_write_requests),
4962         INIT_Q_COUNTER(rx_read_requests),
4963         INIT_Q_COUNTER(rx_atomic_requests),
4964         INIT_Q_COUNTER(out_of_buffer),
4965 };
4966
4967 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
4968         INIT_Q_COUNTER(out_of_sequence),
4969 };
4970
4971 static const struct mlx5_ib_counter retrans_q_cnts[] = {
4972         INIT_Q_COUNTER(duplicate_request),
4973         INIT_Q_COUNTER(rnr_nak_retry_err),
4974         INIT_Q_COUNTER(packet_seq_err),
4975         INIT_Q_COUNTER(implied_nak_seq_err),
4976         INIT_Q_COUNTER(local_ack_timeout_err),
4977 };
4978
4979 #define INIT_CONG_COUNTER(_name)                \
4980         { .name = #_name, .offset =     \
4981                 MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
4982
4983 static const struct mlx5_ib_counter cong_cnts[] = {
4984         INIT_CONG_COUNTER(rp_cnp_ignored),
4985         INIT_CONG_COUNTER(rp_cnp_handled),
4986         INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
4987         INIT_CONG_COUNTER(np_cnp_sent),
4988 };
4989
4990 static const struct mlx5_ib_counter extended_err_cnts[] = {
4991         INIT_Q_COUNTER(resp_local_length_error),
4992         INIT_Q_COUNTER(resp_cqe_error),
4993         INIT_Q_COUNTER(req_cqe_error),
4994         INIT_Q_COUNTER(req_remote_invalid_request),
4995         INIT_Q_COUNTER(req_remote_access_errors),
4996         INIT_Q_COUNTER(resp_remote_access_errors),
4997         INIT_Q_COUNTER(resp_cqe_flush_error),
4998         INIT_Q_COUNTER(req_cqe_flush_error),
4999 };
5000
5001 #define INIT_EXT_PPCNT_COUNTER(_name)           \
5002         { .name = #_name, .offset =     \
5003         MLX5_BYTE_OFF(ppcnt_reg, \
5004                       counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
5005
5006 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
5007         INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
5008 };
5009
5010 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
5011 {
5012         int i;
5013
5014         for (i = 0; i < dev->num_ports; i++) {
5015                 if (dev->port[i].cnts.set_id_valid)
5016                         mlx5_core_dealloc_q_counter(dev->mdev,
5017                                                     dev->port[i].cnts.set_id);
5018                 kfree(dev->port[i].cnts.names);
5019                 kfree(dev->port[i].cnts.offsets);
5020         }
5021 }
5022
5023 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
5024                                     struct mlx5_ib_counters *cnts)
5025 {
5026         u32 num_counters;
5027
5028         num_counters = ARRAY_SIZE(basic_q_cnts);
5029
5030         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
5031                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
5032
5033         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
5034                 num_counters += ARRAY_SIZE(retrans_q_cnts);
5035
5036         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
5037                 num_counters += ARRAY_SIZE(extended_err_cnts);
5038
5039         cnts->num_q_counters = num_counters;
5040
5041         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5042                 cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
5043                 num_counters += ARRAY_SIZE(cong_cnts);
5044         }
5045         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5046                 cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
5047                 num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
5048         }
5049         cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
5050         if (!cnts->names)
5051                 return -ENOMEM;
5052
5053         cnts->offsets = kcalloc(num_counters,
5054                                 sizeof(cnts->offsets), GFP_KERNEL);
5055         if (!cnts->offsets)
5056                 goto err_names;
5057
5058         return 0;
5059
5060 err_names:
5061         kfree(cnts->names);
5062         cnts->names = NULL;
5063         return -ENOMEM;
5064 }
5065
5066 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
5067                                   const char **names,
5068                                   size_t *offsets)
5069 {
5070         int i;
5071         int j = 0;
5072
5073         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
5074                 names[j] = basic_q_cnts[i].name;
5075                 offsets[j] = basic_q_cnts[i].offset;
5076         }
5077
5078         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
5079                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
5080                         names[j] = out_of_seq_q_cnts[i].name;
5081                         offsets[j] = out_of_seq_q_cnts[i].offset;
5082                 }
5083         }
5084
5085         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
5086                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
5087                         names[j] = retrans_q_cnts[i].name;
5088                         offsets[j] = retrans_q_cnts[i].offset;
5089                 }
5090         }
5091
5092         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
5093                 for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
5094                         names[j] = extended_err_cnts[i].name;
5095                         offsets[j] = extended_err_cnts[i].offset;
5096                 }
5097         }
5098
5099         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5100                 for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
5101                         names[j] = cong_cnts[i].name;
5102                         offsets[j] = cong_cnts[i].offset;
5103                 }
5104         }
5105
5106         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5107                 for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
5108                         names[j] = ext_ppcnt_cnts[i].name;
5109                         offsets[j] = ext_ppcnt_cnts[i].offset;
5110                 }
5111         }
5112 }
5113
5114 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
5115 {
5116         int err = 0;
5117         int i;
5118
5119         for (i = 0; i < dev->num_ports; i++) {
5120                 err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
5121                 if (err)
5122                         goto err_alloc;
5123
5124                 mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
5125                                       dev->port[i].cnts.offsets);
5126
5127                 err = mlx5_core_alloc_q_counter(dev->mdev,
5128                                                 &dev->port[i].cnts.set_id);
5129                 if (err) {
5130                         mlx5_ib_warn(dev,
5131                                      "couldn't allocate queue counter for port %d, err %d\n",
5132                                      i + 1, err);
5133                         goto err_alloc;
5134                 }
5135                 dev->port[i].cnts.set_id_valid = true;
5136         }
5137
5138         return 0;
5139
5140 err_alloc:
5141         mlx5_ib_dealloc_counters(dev);
5142         return err;
5143 }
5144
5145 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
5146                                                     u8 port_num)
5147 {
5148         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5149         struct mlx5_ib_port *port = &dev->port[port_num - 1];
5150
5151         /* We support only per port stats */
5152         if (port_num == 0)
5153                 return NULL;
5154
5155         return rdma_alloc_hw_stats_struct(port->cnts.names,
5156                                           port->cnts.num_q_counters +
5157                                           port->cnts.num_cong_counters +
5158                                           port->cnts.num_ext_ppcnt_counters,
5159                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5160 }
5161
5162 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
5163                                     struct mlx5_ib_port *port,
5164                                     struct rdma_hw_stats *stats)
5165 {
5166         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
5167         void *out;
5168         __be32 val;
5169         int ret, i;
5170
5171         out = kvzalloc(outlen, GFP_KERNEL);
5172         if (!out)
5173                 return -ENOMEM;
5174
5175         ret = mlx5_core_query_q_counter(mdev,
5176                                         port->cnts.set_id, 0,
5177                                         out, outlen);
5178         if (ret)
5179                 goto free;
5180
5181         for (i = 0; i < port->cnts.num_q_counters; i++) {
5182                 val = *(__be32 *)(out + port->cnts.offsets[i]);
5183                 stats->value[i] = (u64)be32_to_cpu(val);
5184         }
5185
5186 free:
5187         kvfree(out);
5188         return ret;
5189 }
5190
5191 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
5192                                           struct mlx5_ib_port *port,
5193                                           struct rdma_hw_stats *stats)
5194 {
5195         int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters;
5196         int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
5197         int ret, i;
5198         void *out;
5199
5200         out = kvzalloc(sz, GFP_KERNEL);
5201         if (!out)
5202                 return -ENOMEM;
5203
5204         ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
5205         if (ret)
5206                 goto free;
5207
5208         for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) {
5209                 stats->value[i + offset] =
5210                         be64_to_cpup((__be64 *)(out +
5211                                     port->cnts.offsets[i + offset]));
5212         }
5213
5214 free:
5215         kvfree(out);
5216         return ret;
5217 }
5218
5219 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
5220                                 struct rdma_hw_stats *stats,
5221                                 u8 port_num, int index)
5222 {
5223         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5224         struct mlx5_ib_port *port = &dev->port[port_num - 1];
5225         struct mlx5_core_dev *mdev;
5226         int ret, num_counters;
5227         u8 mdev_port_num;
5228
5229         if (!stats)
5230                 return -EINVAL;
5231
5232         num_counters = port->cnts.num_q_counters +
5233                        port->cnts.num_cong_counters +
5234                        port->cnts.num_ext_ppcnt_counters;
5235
5236         /* q_counters are per IB device, query the master mdev */
5237         ret = mlx5_ib_query_q_counters(dev->mdev, port, stats);
5238         if (ret)
5239                 return ret;
5240
5241         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5242                 ret =  mlx5_ib_query_ext_ppcnt_counters(dev, port, stats);
5243                 if (ret)
5244                         return ret;
5245         }
5246
5247         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5248                 mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
5249                                                     &mdev_port_num);
5250                 if (!mdev) {
5251                         /* If port is not affiliated yet, its in down state
5252                          * which doesn't have any counters yet, so it would be
5253                          * zero. So no need to read from the HCA.
5254                          */
5255                         goto done;
5256                 }
5257                 ret = mlx5_lag_query_cong_counters(dev->mdev,
5258                                                    stats->value +
5259                                                    port->cnts.num_q_counters,
5260                                                    port->cnts.num_cong_counters,
5261                                                    port->cnts.offsets +
5262                                                    port->cnts.num_q_counters);
5263
5264                 mlx5_ib_put_native_port_mdev(dev, port_num);
5265                 if (ret)
5266                         return ret;
5267         }
5268
5269 done:
5270         return num_counters;
5271 }
5272
5273 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
5274                                  enum rdma_netdev_t type,
5275                                  struct rdma_netdev_alloc_params *params)
5276 {
5277         if (type != RDMA_NETDEV_IPOIB)
5278                 return -EOPNOTSUPP;
5279
5280         return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
5281 }
5282
5283 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
5284 {
5285         if (!dev->delay_drop.dbg)
5286                 return;
5287         debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs);
5288         kfree(dev->delay_drop.dbg);
5289         dev->delay_drop.dbg = NULL;
5290 }
5291
5292 static void cancel_delay_drop(struct mlx5_ib_dev *dev)
5293 {
5294         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5295                 return;
5296
5297         cancel_work_sync(&dev->delay_drop.delay_drop_work);
5298         delay_drop_debugfs_cleanup(dev);
5299 }
5300
5301 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
5302                                        size_t count, loff_t *pos)
5303 {
5304         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5305         char lbuf[20];
5306         int len;
5307
5308         len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
5309         return simple_read_from_buffer(buf, count, pos, lbuf, len);
5310 }
5311
5312 static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
5313                                         size_t count, loff_t *pos)
5314 {
5315         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5316         u32 timeout;
5317         u32 var;
5318
5319         if (kstrtouint_from_user(buf, count, 0, &var))
5320                 return -EFAULT;
5321
5322         timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
5323                         1000);
5324         if (timeout != var)
5325                 mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
5326                             timeout);
5327
5328         delay_drop->timeout = timeout;
5329
5330         return count;
5331 }
5332
5333 static const struct file_operations fops_delay_drop_timeout = {
5334         .owner  = THIS_MODULE,
5335         .open   = simple_open,
5336         .write  = delay_drop_timeout_write,
5337         .read   = delay_drop_timeout_read,
5338 };
5339
5340 static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
5341 {
5342         struct mlx5_ib_dbg_delay_drop *dbg;
5343
5344         if (!mlx5_debugfs_root)
5345                 return 0;
5346
5347         dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);
5348         if (!dbg)
5349                 return -ENOMEM;
5350
5351         dev->delay_drop.dbg = dbg;
5352
5353         dbg->dir_debugfs =
5354                 debugfs_create_dir("delay_drop",
5355                                    dev->mdev->priv.dbg_root);
5356         if (!dbg->dir_debugfs)
5357                 goto out_debugfs;
5358
5359         dbg->events_cnt_debugfs =
5360                 debugfs_create_atomic_t("num_timeout_events", 0400,
5361                                         dbg->dir_debugfs,
5362                                         &dev->delay_drop.events_cnt);
5363         if (!dbg->events_cnt_debugfs)
5364                 goto out_debugfs;
5365
5366         dbg->rqs_cnt_debugfs =
5367                 debugfs_create_atomic_t("num_rqs", 0400,
5368                                         dbg->dir_debugfs,
5369                                         &dev->delay_drop.rqs_cnt);
5370         if (!dbg->rqs_cnt_debugfs)
5371                 goto out_debugfs;
5372
5373         dbg->timeout_debugfs =
5374                 debugfs_create_file("timeout", 0600,
5375                                     dbg->dir_debugfs,
5376                                     &dev->delay_drop,
5377                                     &fops_delay_drop_timeout);
5378         if (!dbg->timeout_debugfs)
5379                 goto out_debugfs;
5380
5381         return 0;
5382
5383 out_debugfs:
5384         delay_drop_debugfs_cleanup(dev);
5385         return -ENOMEM;
5386 }
5387
5388 static void init_delay_drop(struct mlx5_ib_dev *dev)
5389 {
5390         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5391                 return;
5392
5393         mutex_init(&dev->delay_drop.lock);
5394         dev->delay_drop.dev = dev;
5395         dev->delay_drop.activate = false;
5396         dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
5397         INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
5398         atomic_set(&dev->delay_drop.rqs_cnt, 0);
5399         atomic_set(&dev->delay_drop.events_cnt, 0);
5400
5401         if (delay_drop_debugfs_init(dev))
5402                 mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
5403 }
5404
5405 /* The mlx5_ib_multiport_mutex should be held when calling this function */
5406 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
5407                                       struct mlx5_ib_multiport_info *mpi)
5408 {
5409         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5410         struct mlx5_ib_port *port = &ibdev->port[port_num];
5411         int comps;
5412         int err;
5413         int i;
5414
5415         mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
5416
5417         spin_lock(&port->mp.mpi_lock);
5418         if (!mpi->ibdev) {
5419                 spin_unlock(&port->mp.mpi_lock);
5420                 return;
5421         }
5422
5423         if (mpi->mdev_events.notifier_call)
5424                 mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
5425         mpi->mdev_events.notifier_call = NULL;
5426
5427         mpi->ibdev = NULL;
5428
5429         spin_unlock(&port->mp.mpi_lock);
5430         mlx5_remove_netdev_notifier(ibdev, port_num);
5431         spin_lock(&port->mp.mpi_lock);
5432
5433         comps = mpi->mdev_refcnt;
5434         if (comps) {
5435                 mpi->unaffiliate = true;
5436                 init_completion(&mpi->unref_comp);
5437                 spin_unlock(&port->mp.mpi_lock);
5438
5439                 for (i = 0; i < comps; i++)
5440                         wait_for_completion(&mpi->unref_comp);
5441
5442                 spin_lock(&port->mp.mpi_lock);
5443                 mpi->unaffiliate = false;
5444         }
5445
5446         port->mp.mpi = NULL;
5447
5448         list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
5449
5450         spin_unlock(&port->mp.mpi_lock);
5451
5452         err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
5453
5454         mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
5455         /* Log an error, still needed to cleanup the pointers and add
5456          * it back to the list.
5457          */
5458         if (err)
5459                 mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
5460                             port_num + 1);
5461
5462         ibdev->roce[port_num].last_port_state = IB_PORT_DOWN;
5463 }
5464
5465 /* The mlx5_ib_multiport_mutex should be held when calling this function */
5466 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
5467                                     struct mlx5_ib_multiport_info *mpi)
5468 {
5469         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5470         int err;
5471
5472         spin_lock(&ibdev->port[port_num].mp.mpi_lock);
5473         if (ibdev->port[port_num].mp.mpi) {
5474                 mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
5475                             port_num + 1);
5476                 spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5477                 return false;
5478         }
5479
5480         ibdev->port[port_num].mp.mpi = mpi;
5481         mpi->ibdev = ibdev;
5482         mpi->mdev_events.notifier_call = NULL;
5483         spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5484
5485         err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
5486         if (err)
5487                 goto unbind;
5488
5489         err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
5490         if (err)
5491                 goto unbind;
5492
5493         err = mlx5_add_netdev_notifier(ibdev, port_num);
5494         if (err) {
5495                 mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
5496                             port_num + 1);
5497                 goto unbind;
5498         }
5499
5500         mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
5501         mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
5502
5503         err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
5504         if (err)
5505                 goto unbind;
5506
5507         return true;
5508
5509 unbind:
5510         mlx5_ib_unbind_slave_port(ibdev, mpi);
5511         return false;
5512 }
5513
5514 static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
5515 {
5516         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
5517         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
5518                                                           port_num + 1);
5519         struct mlx5_ib_multiport_info *mpi;
5520         int err;
5521         int i;
5522
5523         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
5524                 return 0;
5525
5526         err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
5527                                                      &dev->sys_image_guid);
5528         if (err)
5529                 return err;
5530
5531         err = mlx5_nic_vport_enable_roce(dev->mdev);
5532         if (err)
5533                 return err;
5534
5535         mutex_lock(&mlx5_ib_multiport_mutex);
5536         for (i = 0; i < dev->num_ports; i++) {
5537                 bool bound = false;
5538
5539                 /* build a stub multiport info struct for the native port. */
5540                 if (i == port_num) {
5541                         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
5542                         if (!mpi) {
5543                                 mutex_unlock(&mlx5_ib_multiport_mutex);
5544                                 mlx5_nic_vport_disable_roce(dev->mdev);
5545                                 return -ENOMEM;
5546                         }
5547
5548                         mpi->is_master = true;
5549                         mpi->mdev = dev->mdev;
5550                         mpi->sys_image_guid = dev->sys_image_guid;
5551                         dev->port[i].mp.mpi = mpi;
5552                         mpi->ibdev = dev;
5553                         mpi = NULL;
5554                         continue;
5555                 }
5556
5557                 list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
5558                                     list) {
5559                         if (dev->sys_image_guid == mpi->sys_image_guid &&
5560                             (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
5561                                 bound = mlx5_ib_bind_slave_port(dev, mpi);
5562                         }
5563
5564                         if (bound) {
5565                                 dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n");
5566                                 mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
5567                                 list_del(&mpi->list);
5568                                 break;
5569                         }
5570                 }
5571                 if (!bound) {
5572                         get_port_caps(dev, i + 1);
5573                         mlx5_ib_dbg(dev, "no free port found for port %d\n",
5574                                     i + 1);
5575                 }
5576         }
5577
5578         list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
5579         mutex_unlock(&mlx5_ib_multiport_mutex);
5580         return err;
5581 }
5582
5583 static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
5584 {
5585         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
5586         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
5587                                                           port_num + 1);
5588         int i;
5589
5590         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
5591                 return;
5592
5593         mutex_lock(&mlx5_ib_multiport_mutex);
5594         for (i = 0; i < dev->num_ports; i++) {
5595                 if (dev->port[i].mp.mpi) {
5596                         /* Destroy the native port stub */
5597                         if (i == port_num) {
5598                                 kfree(dev->port[i].mp.mpi);
5599                                 dev->port[i].mp.mpi = NULL;
5600                         } else {
5601                                 mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
5602                                 mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
5603                         }
5604                 }
5605         }
5606
5607         mlx5_ib_dbg(dev, "removing from devlist\n");
5608         list_del(&dev->ib_dev_list);
5609         mutex_unlock(&mlx5_ib_multiport_mutex);
5610
5611         mlx5_nic_vport_disable_roce(dev->mdev);
5612 }
5613
5614 ADD_UVERBS_ATTRIBUTES_SIMPLE(
5615         mlx5_ib_dm,
5616         UVERBS_OBJECT_DM,
5617         UVERBS_METHOD_DM_ALLOC,
5618         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
5619                             UVERBS_ATTR_TYPE(u64),
5620                             UA_MANDATORY),
5621         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
5622                             UVERBS_ATTR_TYPE(u16),
5623                             UA_MANDATORY));
5624
5625 ADD_UVERBS_ATTRIBUTES_SIMPLE(
5626         mlx5_ib_flow_action,
5627         UVERBS_OBJECT_FLOW_ACTION,
5628         UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
5629         UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
5630                              enum mlx5_ib_uapi_flow_action_flags));
5631
5632 static const struct uapi_definition mlx5_ib_defs[] = {
5633 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
5634         UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
5635         UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
5636 #endif
5637
5638         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
5639                                 &mlx5_ib_flow_action),
5640         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
5641         {}
5642 };
5643
5644 static int mlx5_ib_read_counters(struct ib_counters *counters,
5645                                  struct ib_counters_read_attr *read_attr,
5646                                  struct uverbs_attr_bundle *attrs)
5647 {
5648         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
5649         struct mlx5_read_counters_attr mread_attr = {};
5650         struct mlx5_ib_flow_counters_desc *desc;
5651         int ret, i;
5652
5653         mutex_lock(&mcounters->mcntrs_mutex);
5654         if (mcounters->cntrs_max_index > read_attr->ncounters) {
5655                 ret = -EINVAL;
5656                 goto err_bound;
5657         }
5658
5659         mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
5660                                  GFP_KERNEL);
5661         if (!mread_attr.out) {
5662                 ret = -ENOMEM;
5663                 goto err_bound;
5664         }
5665
5666         mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
5667         mread_attr.flags = read_attr->flags;
5668         ret = mcounters->read_counters(counters->device, &mread_attr);
5669         if (ret)
5670                 goto err_read;
5671
5672         /* do the pass over the counters data array to assign according to the
5673          * descriptions and indexing pairs
5674          */
5675         desc = mcounters->counters_data;
5676         for (i = 0; i < mcounters->ncounters; i++)
5677                 read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
5678
5679 err_read:
5680         kfree(mread_attr.out);
5681 err_bound:
5682         mutex_unlock(&mcounters->mcntrs_mutex);
5683         return ret;
5684 }
5685
5686 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
5687 {
5688         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
5689
5690         counters_clear_description(counters);
5691         if (mcounters->hw_cntrs_hndl)
5692                 mlx5_fc_destroy(to_mdev(counters->device)->mdev,
5693                                 mcounters->hw_cntrs_hndl);
5694
5695         kfree(mcounters);
5696
5697         return 0;
5698 }
5699
5700 static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
5701                                                    struct uverbs_attr_bundle *attrs)
5702 {
5703         struct mlx5_ib_mcounters *mcounters;
5704
5705         mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
5706         if (!mcounters)
5707                 return ERR_PTR(-ENOMEM);
5708
5709         mutex_init(&mcounters->mcntrs_mutex);
5710
5711         return &mcounters->ibcntrs;
5712 }
5713
5714 void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
5715 {
5716         mlx5_ib_cleanup_multiport_master(dev);
5717 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
5718         cleanup_srcu_struct(&dev->mr_srcu);
5719         drain_workqueue(dev->advise_mr_wq);
5720         destroy_workqueue(dev->advise_mr_wq);
5721 #endif
5722         kfree(dev->port);
5723 }
5724
5725 int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
5726 {
5727         struct mlx5_core_dev *mdev = dev->mdev;
5728         int err;
5729         int i;
5730
5731         dev->port = kcalloc(dev->num_ports, sizeof(*dev->port),
5732                             GFP_KERNEL);
5733         if (!dev->port)
5734                 return -ENOMEM;
5735
5736         for (i = 0; i < dev->num_ports; i++) {
5737                 spin_lock_init(&dev->port[i].mp.mpi_lock);
5738                 rwlock_init(&dev->roce[i].netdev_lock);
5739         }
5740
5741         err = mlx5_ib_init_multiport_master(dev);
5742         if (err)
5743                 goto err_free_port;
5744
5745         if (!mlx5_core_mp_enabled(mdev)) {
5746                 for (i = 1; i <= dev->num_ports; i++) {
5747                         err = get_port_caps(dev, i);
5748                         if (err)
5749                                 break;
5750                 }
5751         } else {
5752                 err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
5753         }
5754         if (err)
5755                 goto err_mp;
5756
5757         if (mlx5_use_mad_ifc(dev))
5758                 get_ext_port_caps(dev);
5759
5760         dev->ib_dev.owner               = THIS_MODULE;
5761         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
5762         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
5763         dev->ib_dev.phys_port_cnt       = dev->num_ports;
5764         dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
5765         dev->ib_dev.dev.parent          = &mdev->pdev->dev;
5766
5767         mutex_init(&dev->cap_mask_mutex);
5768         INIT_LIST_HEAD(&dev->qp_list);
5769         spin_lock_init(&dev->reset_flow_resource_lock);
5770
5771         spin_lock_init(&dev->memic.memic_lock);
5772         dev->memic.dev = mdev;
5773
5774 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
5775         dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
5776         if (!dev->advise_mr_wq) {
5777                 err = -ENOMEM;
5778                 goto err_mp;
5779         }
5780
5781         err = init_srcu_struct(&dev->mr_srcu);
5782         if (err) {
5783                 destroy_workqueue(dev->advise_mr_wq);
5784                 goto err_mp;
5785         }
5786 #endif
5787
5788         return 0;
5789 err_mp:
5790         mlx5_ib_cleanup_multiport_master(dev);
5791
5792 err_free_port:
5793         kfree(dev->port);
5794
5795         return -ENOMEM;
5796 }
5797
5798 static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
5799 {
5800         dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
5801
5802         if (!dev->flow_db)
5803                 return -ENOMEM;
5804
5805         mutex_init(&dev->flow_db->lock);
5806
5807         return 0;
5808 }
5809
5810 int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev)
5811 {
5812         struct mlx5_ib_dev *nic_dev;
5813
5814         nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch);
5815
5816         if (!nic_dev)
5817                 return -EINVAL;
5818
5819         dev->flow_db = nic_dev->flow_db;
5820
5821         return 0;
5822 }
5823
5824 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
5825 {
5826         kfree(dev->flow_db);
5827 }
5828
5829 static const struct ib_device_ops mlx5_ib_dev_ops = {
5830         .add_gid = mlx5_ib_add_gid,
5831         .alloc_mr = mlx5_ib_alloc_mr,
5832         .alloc_pd = mlx5_ib_alloc_pd,
5833         .alloc_ucontext = mlx5_ib_alloc_ucontext,
5834         .attach_mcast = mlx5_ib_mcg_attach,
5835         .check_mr_status = mlx5_ib_check_mr_status,
5836         .create_ah = mlx5_ib_create_ah,
5837         .create_counters = mlx5_ib_create_counters,
5838         .create_cq = mlx5_ib_create_cq,
5839         .create_flow = mlx5_ib_create_flow,
5840         .create_qp = mlx5_ib_create_qp,
5841         .create_srq = mlx5_ib_create_srq,
5842         .dealloc_pd = mlx5_ib_dealloc_pd,
5843         .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
5844         .del_gid = mlx5_ib_del_gid,
5845         .dereg_mr = mlx5_ib_dereg_mr,
5846         .destroy_ah = mlx5_ib_destroy_ah,
5847         .destroy_counters = mlx5_ib_destroy_counters,
5848         .destroy_cq = mlx5_ib_destroy_cq,
5849         .destroy_flow = mlx5_ib_destroy_flow,
5850         .destroy_flow_action = mlx5_ib_destroy_flow_action,
5851         .destroy_qp = mlx5_ib_destroy_qp,
5852         .destroy_srq = mlx5_ib_destroy_srq,
5853         .detach_mcast = mlx5_ib_mcg_detach,
5854         .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
5855         .drain_rq = mlx5_ib_drain_rq,
5856         .drain_sq = mlx5_ib_drain_sq,
5857         .get_dev_fw_str = get_dev_fw_str,
5858         .get_dma_mr = mlx5_ib_get_dma_mr,
5859         .get_link_layer = mlx5_ib_port_link_layer,
5860         .map_mr_sg = mlx5_ib_map_mr_sg,
5861         .mmap = mlx5_ib_mmap,
5862         .modify_cq = mlx5_ib_modify_cq,
5863         .modify_device = mlx5_ib_modify_device,
5864         .modify_port = mlx5_ib_modify_port,
5865         .modify_qp = mlx5_ib_modify_qp,
5866         .modify_srq = mlx5_ib_modify_srq,
5867         .poll_cq = mlx5_ib_poll_cq,
5868         .post_recv = mlx5_ib_post_recv,
5869         .post_send = mlx5_ib_post_send,
5870         .post_srq_recv = mlx5_ib_post_srq_recv,
5871         .process_mad = mlx5_ib_process_mad,
5872         .query_ah = mlx5_ib_query_ah,
5873         .query_device = mlx5_ib_query_device,
5874         .query_gid = mlx5_ib_query_gid,
5875         .query_pkey = mlx5_ib_query_pkey,
5876         .query_qp = mlx5_ib_query_qp,
5877         .query_srq = mlx5_ib_query_srq,
5878         .read_counters = mlx5_ib_read_counters,
5879         .reg_user_mr = mlx5_ib_reg_user_mr,
5880         .req_notify_cq = mlx5_ib_arm_cq,
5881         .rereg_user_mr = mlx5_ib_rereg_user_mr,
5882         .resize_cq = mlx5_ib_resize_cq,
5883 };
5884
5885 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
5886         .create_flow_action_esp = mlx5_ib_create_flow_action_esp,
5887         .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
5888 };
5889
5890 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
5891         .rdma_netdev_get_params = mlx5_ib_rn_get_params,
5892 };
5893
5894 static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
5895         .get_vf_config = mlx5_ib_get_vf_config,
5896         .get_vf_stats = mlx5_ib_get_vf_stats,
5897         .set_vf_guid = mlx5_ib_set_vf_guid,
5898         .set_vf_link_state = mlx5_ib_set_vf_link_state,
5899 };
5900
5901 static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
5902         .alloc_mw = mlx5_ib_alloc_mw,
5903         .dealloc_mw = mlx5_ib_dealloc_mw,
5904 };
5905
5906 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
5907         .alloc_xrcd = mlx5_ib_alloc_xrcd,
5908         .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
5909 };
5910
5911 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
5912         .alloc_dm = mlx5_ib_alloc_dm,
5913         .dealloc_dm = mlx5_ib_dealloc_dm,
5914         .reg_dm_mr = mlx5_ib_reg_dm_mr,
5915 };
5916
5917 int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
5918 {
5919         struct mlx5_core_dev *mdev = dev->mdev;
5920         int err;
5921
5922         dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
5923         dev->ib_dev.uverbs_cmd_mask     =
5924                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
5925                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
5926                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
5927                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
5928                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
5929                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
5930                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
5931                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
5932                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
5933                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
5934                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
5935                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
5936                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
5937                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
5938                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
5939                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
5940                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
5941                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
5942                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
5943                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
5944                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
5945                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
5946                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
5947                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
5948                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
5949                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
5950         dev->ib_dev.uverbs_ex_cmd_mask =
5951                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
5952                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
5953                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
5954                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)        |
5955                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)        |
5956                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)      |
5957                 (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
5958
5959         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
5960             IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
5961                 ib_set_device_ops(&dev->ib_dev,
5962                                   &mlx5_ib_dev_ipoib_enhanced_ops);
5963
5964         if (mlx5_core_is_pf(mdev))
5965                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
5966
5967         dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
5968
5969         if (MLX5_CAP_GEN(mdev, imaicl)) {
5970                 dev->ib_dev.uverbs_cmd_mask |=
5971                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
5972                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
5973                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
5974         }
5975
5976         if (MLX5_CAP_GEN(mdev, xrc)) {
5977                 dev->ib_dev.uverbs_cmd_mask |=
5978                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
5979                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
5980                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
5981         }
5982
5983         if (MLX5_CAP_DEV_MEM(mdev, memic))
5984                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
5985
5986         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
5987             MLX5_ACCEL_IPSEC_CAP_DEVICE)
5988                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
5989         dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
5990         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
5991
5992         if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
5993                 dev->ib_dev.driver_def = mlx5_ib_defs;
5994
5995         err = init_node_data(dev);
5996         if (err)
5997                 return err;
5998
5999         if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
6000             (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
6001              MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
6002                 mutex_init(&dev->lb.mutex);
6003
6004         return 0;
6005 }
6006
6007 static const struct ib_device_ops mlx5_ib_dev_port_ops = {
6008         .get_port_immutable = mlx5_port_immutable,
6009         .query_port = mlx5_ib_query_port,
6010 };
6011
6012 static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
6013 {
6014         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
6015         return 0;
6016 }
6017
6018 static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
6019         .get_port_immutable = mlx5_port_rep_immutable,
6020         .query_port = mlx5_ib_rep_query_port,
6021 };
6022
6023 int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
6024 {
6025         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
6026         return 0;
6027 }
6028
6029 static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
6030         .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
6031         .create_wq = mlx5_ib_create_wq,
6032         .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
6033         .destroy_wq = mlx5_ib_destroy_wq,
6034         .get_netdev = mlx5_ib_get_netdev,
6035         .modify_wq = mlx5_ib_modify_wq,
6036 };
6037
6038 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
6039 {
6040         u8 port_num;
6041         int i;
6042
6043         for (i = 0; i < dev->num_ports; i++) {
6044                 dev->roce[i].dev = dev;
6045                 dev->roce[i].native_port_num = i + 1;
6046                 dev->roce[i].last_port_state = IB_PORT_DOWN;
6047         }
6048
6049         dev->ib_dev.uverbs_ex_cmd_mask |=
6050                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
6051                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
6052                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
6053                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
6054                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
6055         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
6056
6057         port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6058
6059         return mlx5_add_netdev_notifier(dev, port_num);
6060 }
6061
6062 static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
6063 {
6064         u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6065
6066         mlx5_remove_netdev_notifier(dev, port_num);
6067 }
6068
6069 int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
6070 {
6071         struct mlx5_core_dev *mdev = dev->mdev;
6072         enum rdma_link_layer ll;
6073         int port_type_cap;
6074         int err = 0;
6075
6076         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6077         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6078
6079         if (ll == IB_LINK_LAYER_ETHERNET)
6080                 err = mlx5_ib_stage_common_roce_init(dev);
6081
6082         return err;
6083 }
6084
6085 void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
6086 {
6087         mlx5_ib_stage_common_roce_cleanup(dev);
6088 }
6089
6090 static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
6091 {
6092         struct mlx5_core_dev *mdev = dev->mdev;
6093         enum rdma_link_layer ll;
6094         int port_type_cap;
6095         int err;
6096
6097         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6098         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6099
6100         if (ll == IB_LINK_LAYER_ETHERNET) {
6101                 err = mlx5_ib_stage_common_roce_init(dev);
6102                 if (err)
6103                         return err;
6104
6105                 err = mlx5_enable_eth(dev);
6106                 if (err)
6107                         goto cleanup;
6108         }
6109
6110         return 0;
6111 cleanup:
6112         mlx5_ib_stage_common_roce_cleanup(dev);
6113
6114         return err;
6115 }
6116
6117 static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
6118 {
6119         struct mlx5_core_dev *mdev = dev->mdev;
6120         enum rdma_link_layer ll;
6121         int port_type_cap;
6122
6123         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6124         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6125
6126         if (ll == IB_LINK_LAYER_ETHERNET) {
6127                 mlx5_disable_eth(dev);
6128                 mlx5_ib_stage_common_roce_cleanup(dev);
6129         }
6130 }
6131
6132 int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
6133 {
6134         return create_dev_resources(&dev->devr);
6135 }
6136
6137 void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
6138 {
6139         destroy_dev_resources(&dev->devr);
6140 }
6141
6142 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
6143 {
6144         mlx5_ib_internal_fill_odp_caps(dev);
6145
6146         return mlx5_ib_odp_init_one(dev);
6147 }
6148
6149 void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
6150 {
6151         mlx5_ib_odp_cleanup_one(dev);
6152 }
6153
6154 static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
6155         .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
6156         .get_hw_stats = mlx5_ib_get_hw_stats,
6157 };
6158
6159 int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
6160 {
6161         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
6162                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
6163
6164                 return mlx5_ib_alloc_counters(dev);
6165         }
6166
6167         return 0;
6168 }
6169
6170 void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
6171 {
6172         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
6173                 mlx5_ib_dealloc_counters(dev);
6174 }
6175
6176 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
6177 {
6178         return mlx5_ib_init_cong_debugfs(dev,
6179                                          mlx5_core_native_port_num(dev->mdev) - 1);
6180 }
6181
6182 static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
6183 {
6184         mlx5_ib_cleanup_cong_debugfs(dev,
6185                                      mlx5_core_native_port_num(dev->mdev) - 1);
6186 }
6187
6188 static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
6189 {
6190         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
6191         return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
6192 }
6193
6194 static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
6195 {
6196         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
6197 }
6198
6199 int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
6200 {
6201         int err;
6202
6203         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
6204         if (err)
6205                 return err;
6206
6207         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
6208         if (err)
6209                 mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
6210
6211         return err;
6212 }
6213
6214 void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
6215 {
6216         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
6217         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
6218 }
6219
6220 int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
6221 {
6222         const char *name;
6223
6224         rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
6225         if (!mlx5_lag_is_active(dev->mdev))
6226                 name = "mlx5_%d";
6227         else
6228                 name = "mlx5_bond_%d";
6229         return ib_register_device(&dev->ib_dev, name, NULL);
6230 }
6231
6232 void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
6233 {
6234         destroy_umrc_res(dev);
6235 }
6236
6237 void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
6238 {
6239         ib_unregister_device(&dev->ib_dev);
6240 }
6241
6242 int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
6243 {
6244         return create_umr_res(dev);
6245 }
6246
6247 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
6248 {
6249         init_delay_drop(dev);
6250
6251         return 0;
6252 }
6253
6254 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
6255 {
6256         cancel_delay_drop(dev);
6257 }
6258
6259 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
6260 {
6261         dev->mdev_events.notifier_call = mlx5_ib_event;
6262         mlx5_notifier_register(dev->mdev, &dev->mdev_events);
6263         return 0;
6264 }
6265
6266 static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
6267 {
6268         mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
6269 }
6270
6271 static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
6272 {
6273         int uid;
6274
6275         uid = mlx5_ib_devx_create(dev, false);
6276         if (uid > 0)
6277                 dev->devx_whitelist_uid = uid;
6278
6279         return 0;
6280 }
6281 static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
6282 {
6283         if (dev->devx_whitelist_uid)
6284                 mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
6285 }
6286
6287 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
6288                       const struct mlx5_ib_profile *profile,
6289                       int stage)
6290 {
6291         /* Number of stages to cleanup */
6292         while (stage) {
6293                 stage--;
6294                 if (profile->stage[stage].cleanup)
6295                         profile->stage[stage].cleanup(dev);
6296         }
6297 }
6298
6299 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
6300                     const struct mlx5_ib_profile *profile)
6301 {
6302         int err;
6303         int i;
6304
6305         for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
6306                 if (profile->stage[i].init) {
6307                         err = profile->stage[i].init(dev);
6308                         if (err)
6309                                 goto err_out;
6310                 }
6311         }
6312
6313         dev->profile = profile;
6314         dev->ib_active = true;
6315
6316         return dev;
6317
6318 err_out:
6319         __mlx5_ib_remove(dev, profile, i);
6320
6321         return NULL;
6322 }
6323
6324 static const struct mlx5_ib_profile pf_profile = {
6325         STAGE_CREATE(MLX5_IB_STAGE_INIT,
6326                      mlx5_ib_stage_init_init,
6327                      mlx5_ib_stage_init_cleanup),
6328         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
6329                      mlx5_ib_stage_flow_db_init,
6330                      mlx5_ib_stage_flow_db_cleanup),
6331         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
6332                      mlx5_ib_stage_caps_init,
6333                      NULL),
6334         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
6335                      mlx5_ib_stage_non_default_cb,
6336                      NULL),
6337         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
6338                      mlx5_ib_stage_roce_init,
6339                      mlx5_ib_stage_roce_cleanup),
6340         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
6341                      mlx5_init_srq_table,
6342                      mlx5_cleanup_srq_table),
6343         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
6344                      mlx5_ib_stage_dev_res_init,
6345                      mlx5_ib_stage_dev_res_cleanup),
6346         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
6347                      mlx5_ib_stage_dev_notifier_init,
6348                      mlx5_ib_stage_dev_notifier_cleanup),
6349         STAGE_CREATE(MLX5_IB_STAGE_ODP,
6350                      mlx5_ib_stage_odp_init,
6351                      mlx5_ib_stage_odp_cleanup),
6352         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
6353                      mlx5_ib_stage_counters_init,
6354                      mlx5_ib_stage_counters_cleanup),
6355         STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
6356                      mlx5_ib_stage_cong_debugfs_init,
6357                      mlx5_ib_stage_cong_debugfs_cleanup),
6358         STAGE_CREATE(MLX5_IB_STAGE_UAR,
6359                      mlx5_ib_stage_uar_init,
6360                      mlx5_ib_stage_uar_cleanup),
6361         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
6362                      mlx5_ib_stage_bfrag_init,
6363                      mlx5_ib_stage_bfrag_cleanup),
6364         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
6365                      NULL,
6366                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
6367         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
6368                      mlx5_ib_stage_devx_init,
6369                      mlx5_ib_stage_devx_cleanup),
6370         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
6371                      mlx5_ib_stage_ib_reg_init,
6372                      mlx5_ib_stage_ib_reg_cleanup),
6373         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
6374                      mlx5_ib_stage_post_ib_reg_umr_init,
6375                      NULL),
6376         STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
6377                      mlx5_ib_stage_delay_drop_init,
6378                      mlx5_ib_stage_delay_drop_cleanup),
6379 };
6380
6381 static const struct mlx5_ib_profile nic_rep_profile = {
6382         STAGE_CREATE(MLX5_IB_STAGE_INIT,
6383                      mlx5_ib_stage_init_init,
6384                      mlx5_ib_stage_init_cleanup),
6385         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
6386                      mlx5_ib_stage_flow_db_init,
6387                      mlx5_ib_stage_flow_db_cleanup),
6388         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
6389                      mlx5_ib_stage_caps_init,
6390                      NULL),
6391         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
6392                      mlx5_ib_stage_rep_non_default_cb,
6393                      NULL),
6394         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
6395                      mlx5_ib_stage_rep_roce_init,
6396                      mlx5_ib_stage_rep_roce_cleanup),
6397         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
6398                      mlx5_init_srq_table,
6399                      mlx5_cleanup_srq_table),
6400         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
6401                      mlx5_ib_stage_dev_res_init,
6402                      mlx5_ib_stage_dev_res_cleanup),
6403         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
6404                      mlx5_ib_stage_dev_notifier_init,
6405                      mlx5_ib_stage_dev_notifier_cleanup),
6406         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
6407                      mlx5_ib_stage_counters_init,
6408                      mlx5_ib_stage_counters_cleanup),
6409         STAGE_CREATE(MLX5_IB_STAGE_UAR,
6410                      mlx5_ib_stage_uar_init,
6411                      mlx5_ib_stage_uar_cleanup),
6412         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
6413                      mlx5_ib_stage_bfrag_init,
6414                      mlx5_ib_stage_bfrag_cleanup),
6415         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
6416                      NULL,
6417                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
6418         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
6419                      mlx5_ib_stage_ib_reg_init,
6420                      mlx5_ib_stage_ib_reg_cleanup),
6421         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
6422                      mlx5_ib_stage_post_ib_reg_umr_init,
6423                      NULL),
6424 };
6425
6426 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
6427 {
6428         struct mlx5_ib_multiport_info *mpi;
6429         struct mlx5_ib_dev *dev;
6430         bool bound = false;
6431         int err;
6432
6433         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
6434         if (!mpi)
6435                 return NULL;
6436
6437         mpi->mdev = mdev;
6438
6439         err = mlx5_query_nic_vport_system_image_guid(mdev,
6440                                                      &mpi->sys_image_guid);
6441         if (err) {
6442                 kfree(mpi);
6443                 return NULL;
6444         }
6445
6446         mutex_lock(&mlx5_ib_multiport_mutex);
6447         list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
6448                 if (dev->sys_image_guid == mpi->sys_image_guid)
6449                         bound = mlx5_ib_bind_slave_port(dev, mpi);
6450
6451                 if (bound) {
6452                         rdma_roce_rescan_device(&dev->ib_dev);
6453                         break;
6454                 }
6455         }
6456
6457         if (!bound) {
6458                 list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
6459                 dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n");
6460         }
6461         mutex_unlock(&mlx5_ib_multiport_mutex);
6462
6463         return mpi;
6464 }
6465
6466 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
6467 {
6468         enum rdma_link_layer ll;
6469         struct mlx5_ib_dev *dev;
6470         int port_type_cap;
6471
6472         printk_once(KERN_INFO "%s", mlx5_version);
6473
6474         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6475         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6476
6477         if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
6478                 return mlx5_ib_add_slave_port(mdev);
6479
6480         dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
6481         if (!dev)
6482                 return NULL;
6483
6484         dev->mdev = mdev;
6485         dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
6486                              MLX5_CAP_GEN(mdev, num_vhca_ports));
6487
6488         if (MLX5_ESWITCH_MANAGER(mdev) &&
6489             mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
6490                 dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0);
6491                 dev->profile = &nic_rep_profile;
6492                 mlx5_ib_register_vport_reps(dev);
6493                 return dev;
6494         }
6495
6496         return __mlx5_ib_add(dev, &pf_profile);
6497 }
6498
6499 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
6500 {
6501         struct mlx5_ib_multiport_info *mpi;
6502         struct mlx5_ib_dev *dev;
6503
6504         if (mlx5_core_is_mp_slave(mdev)) {
6505                 mpi = context;
6506                 mutex_lock(&mlx5_ib_multiport_mutex);
6507                 if (mpi->ibdev)
6508                         mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
6509                 list_del(&mpi->list);
6510                 mutex_unlock(&mlx5_ib_multiport_mutex);
6511                 return;
6512         }
6513
6514         dev = context;
6515         if (dev->profile == &nic_rep_profile)
6516                 mlx5_ib_unregister_vport_reps(dev);
6517         else
6518                 __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
6519
6520         ib_dealloc_device((struct ib_device *)dev);
6521 }
6522
6523 static struct mlx5_interface mlx5_ib_interface = {
6524         .add            = mlx5_ib_add,
6525         .remove         = mlx5_ib_remove,
6526         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
6527 };
6528
6529 unsigned long mlx5_ib_get_xlt_emergency_page(void)
6530 {
6531         mutex_lock(&xlt_emergency_page_mutex);
6532         return xlt_emergency_page;
6533 }
6534
6535 void mlx5_ib_put_xlt_emergency_page(void)
6536 {
6537         mutex_unlock(&xlt_emergency_page_mutex);
6538 }
6539
6540 static int __init mlx5_ib_init(void)
6541 {
6542         int err;
6543
6544         xlt_emergency_page = __get_free_page(GFP_KERNEL);
6545         if (!xlt_emergency_page)
6546                 return -ENOMEM;
6547
6548         mutex_init(&xlt_emergency_page_mutex);
6549
6550         mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
6551         if (!mlx5_ib_event_wq) {
6552                 free_page(xlt_emergency_page);
6553                 return -ENOMEM;
6554         }
6555
6556         mlx5_ib_odp_init();
6557
6558         err = mlx5_register_interface(&mlx5_ib_interface);
6559
6560         return err;
6561 }
6562
6563 static void __exit mlx5_ib_cleanup(void)
6564 {
6565         mlx5_unregister_interface(&mlx5_ib_interface);
6566         destroy_workqueue(mlx5_ib_event_wq);
6567         mutex_destroy(&xlt_emergency_page_mutex);
6568         free_page(xlt_emergency_page);
6569 }
6570
6571 module_init(mlx5_ib_init);
6572 module_exit(mlx5_ib_cleanup);