IB/mlx5: Add implicit MR support
[linux-2.6-block.git] drivers/infiniband/hw/mlx5/main.c
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/highmem.h>
34 #include <linux/module.h>
35 #include <linux/init.h>
36 #include <linux/errno.h>
37 #include <linux/pci.h>
38 #include <linux/dma-mapping.h>
39 #include <linux/slab.h>
40 #if defined(CONFIG_X86)
41 #include <asm/pat.h>
42 #endif
43 #include <linux/sched.h>
44 #include <linux/delay.h>
45 #include <rdma/ib_user_verbs.h>
46 #include <rdma/ib_addr.h>
47 #include <rdma/ib_cache.h>
48 #include <linux/mlx5/port.h>
49 #include <linux/mlx5/vport.h>
50 #include <linux/list.h>
51 #include <rdma/ib_smi.h>
52 #include <rdma/ib_umem.h>
53 #include <linux/in.h>
54 #include <linux/etherdevice.h>
55 #include <linux/mlx5/fs.h>
56 #include "mlx5_ib.h"
57
58 #define DRIVER_NAME "mlx5_ib"
59 #define DRIVER_VERSION "2.2-1"
60 #define DRIVER_RELDATE  "Feb 2014"
61
62 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
63 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
64 MODULE_LICENSE("Dual BSD/GPL");
65 MODULE_VERSION(DRIVER_VERSION);
66
67 static char mlx5_version[] =
68         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
69         DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
70
71 enum {
72         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
73 };
74
75 static enum rdma_link_layer
76 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
77 {
78         switch (port_type_cap) {
79         case MLX5_CAP_PORT_TYPE_IB:
80                 return IB_LINK_LAYER_INFINIBAND;
81         case MLX5_CAP_PORT_TYPE_ETH:
82                 return IB_LINK_LAYER_ETHERNET;
83         default:
84                 return IB_LINK_LAYER_UNSPECIFIED;
85         }
86 }
87
88 static enum rdma_link_layer
89 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
90 {
91         struct mlx5_ib_dev *dev = to_mdev(device);
92         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
93
94         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
95 }
96
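/* Netdev notifier for the RoCE port: keep roce.netdev pointing at the
 * netdevice that belongs to this HCA's PCI function, and translate
 * carrier up/down events (on that netdev or its LAG master) into
 * IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR on port 1.
 */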
97 static int mlx5_netdev_event(struct notifier_block *this,
98                              unsigned long event, void *ptr)
99 {
100         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
101         struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
102                                                  roce.nb);
103
104         switch (event) {
105         case NETDEV_REGISTER:
106         case NETDEV_UNREGISTER:
107                 write_lock(&ibdev->roce.netdev_lock);
108                 if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
109                         ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
110                                              NULL : ndev;
111                 write_unlock(&ibdev->roce.netdev_lock);
112                 break;
113
114         case NETDEV_UP:
115         case NETDEV_DOWN: {
116                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
117                 struct net_device *upper = NULL;
118
119                 if (lag_ndev) {
120                         upper = netdev_master_upper_dev_get(lag_ndev);
121                         dev_put(lag_ndev);
122                 }
123
124                 if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
125                     && ibdev->ib_active) {
126                         struct ib_event ibev = { };
127
128                         ibev.device = &ibdev->ib_dev;
129                         ibev.event = (event == NETDEV_UP) ?
130                                      IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
131                         ibev.element.port_num = 1;
132                         ib_dispatch_event(&ibev);
133                 }
134                 break;
135         }
136
137         default:
138                 break;
139         }
140
141         return NOTIFY_DONE;
142 }
143
144 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
145                                              u8 port_num)
146 {
147         struct mlx5_ib_dev *ibdev = to_mdev(device);
148         struct net_device *ndev;
149
150         ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
151         if (ndev)
152                 return ndev;
153
154         /* Ensure ndev does not disappear before we invoke dev_hold()
155          */
156         read_lock(&ibdev->roce.netdev_lock);
157         ndev = ibdev->roce.netdev;
158         if (ndev)
159                 dev_hold(ndev);
160         read_unlock(&ibdev->roce.netdev_lock);
161
162         return ndev;
163 }
164
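/* Port attributes for an Ethernet (RoCE) port are derived from firmware
 * capabilities plus the state of the associated netdev: link state and
 * MTU come from the netdev (or its LAG master when LAG is active), while
 * active width/speed are currently reported as fixed values.
 */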
165 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
166                                 struct ib_port_attr *props)
167 {
168         struct mlx5_ib_dev *dev = to_mdev(device);
169         struct net_device *ndev, *upper;
170         enum ib_mtu ndev_ib_mtu;
171         u16 qkey_viol_cntr;
172
173         memset(props, 0, sizeof(*props));
174
175         props->port_cap_flags  |= IB_PORT_CM_SUP;
176         props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
177
178         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
179                                                 roce_address_table_size);
180         props->max_mtu          = IB_MTU_4096;
181         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
182         props->pkey_tbl_len     = 1;
183         props->state            = IB_PORT_DOWN;
184         props->phys_state       = 3;
185
186         mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
187         props->qkey_viol_cntr = qkey_viol_cntr;
188
189         ndev = mlx5_ib_get_netdev(device, port_num);
190         if (!ndev)
191                 return 0;
192
193         if (mlx5_lag_is_active(dev->mdev)) {
194                 rcu_read_lock();
195                 upper = netdev_master_upper_dev_get_rcu(ndev);
196                 if (upper) {
197                         dev_put(ndev);
198                         ndev = upper;
199                         dev_hold(ndev);
200                 }
201                 rcu_read_unlock();
202         }
203
204         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
205                 props->state      = IB_PORT_ACTIVE;
206                 props->phys_state = 5;
207         }
208
209         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
210
211         dev_put(ndev);
212
213         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
214
215         props->active_width     = IB_WIDTH_4X;  /* TODO */
216         props->active_speed     = IB_SPEED_QDR; /* TODO */
217
218         return 0;
219 }
220
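/* Pack a GID and its attributes into the firmware roce_addr_layout:
 * the source MAC (and VLAN, for VLAN netdevs) comes from the attached
 * netdev, the RoCE version is derived from the GID type, and for RoCE v2
 * the L3 type and address reflect whether the GID is an IPv4-mapped
 * address.
 */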
221 static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
222                                      const struct ib_gid_attr *attr,
223                                      void *mlx5_addr)
224 {
225 #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
226         char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
227                                                source_l3_address);
228         void *mlx5_addr_mac     = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
229                                                source_mac_47_32);
230
231         if (!gid)
232                 return;
233
234         ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
235
236         if (is_vlan_dev(attr->ndev)) {
237                 MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
238                 MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
239         }
240
241         switch (attr->gid_type) {
242         case IB_GID_TYPE_IB:
243                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
244                 break;
245         case IB_GID_TYPE_ROCE_UDP_ENCAP:
246                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
247                 break;
248
249         default:
250                 WARN_ON(true);
251         }
252
253         if (attr->gid_type != IB_GID_TYPE_IB) {
254                 if (ipv6_addr_v4mapped((void *)gid))
255                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
256                                     MLX5_ROCE_L3_TYPE_IPV4);
257                 else
258                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
259                                     MLX5_ROCE_L3_TYPE_IPV6);
260         }
261
262         if ((attr->gid_type == IB_GID_TYPE_IB) ||
263             !ipv6_addr_v4mapped((void *)gid))
264                 memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
265         else
266                 memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
267 }
268
269 static int set_roce_addr(struct ib_device *device, u8 port_num,
270                          unsigned int index,
271                          const union ib_gid *gid,
272                          const struct ib_gid_attr *attr)
273 {
274         struct mlx5_ib_dev *dev = to_mdev(device);
275         u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
276         u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
277         void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
278         enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
279
280         if (ll != IB_LINK_LAYER_ETHERNET)
281                 return -EINVAL;
282
283         ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
284
285         MLX5_SET(set_roce_address_in, in, roce_address_index, index);
286         MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
287         return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
288 }
289
290 static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
291                            unsigned int index, const union ib_gid *gid,
292                            const struct ib_gid_attr *attr,
293                            __always_unused void **context)
294 {
295         return set_roce_addr(device, port_num, index, gid, attr);
296 }
297
298 static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
299                            unsigned int index, __always_unused void **context)
300 {
301         return set_roce_addr(device, port_num, index, NULL, NULL);
302 }
303
304 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
305                                int index)
306 {
307         struct ib_gid_attr attr;
308         union ib_gid gid;
309
310         if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
311                 return 0;
312
313         if (!attr.ndev)
314                 return 0;
315
316         dev_put(attr.ndev);
317
318         if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
319                 return 0;
320
321         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
322 }
323
324 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
325                            int index, enum ib_gid_type *gid_type)
326 {
327         struct ib_gid_attr attr;
328         union ib_gid gid;
329         int ret;
330
331         ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
332         if (ret)
333                 return ret;
334
335         if (!attr.ndev)
336                 return -ENODEV;
337
338         dev_put(attr.ndev);
339
340         *gid_type = attr.gid_type;
341
342         return 0;
343 }
344
345 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
346 {
347         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
348                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
349         return 0;
350 }
351
352 enum {
353         MLX5_VPORT_ACCESS_METHOD_MAD,
354         MLX5_VPORT_ACCESS_METHOD_HCA,
355         MLX5_VPORT_ACCESS_METHOD_NIC,
356 };
357
358 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
359 {
360         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
361                 return MLX5_VPORT_ACCESS_METHOD_MAD;
362
363         if (mlx5_ib_port_link_layer(ibdev, 1) ==
364             IB_LINK_LAYER_ETHERNET)
365                 return MLX5_VPORT_ACCESS_METHOD_NIC;
366
367         return MLX5_VPORT_ACCESS_METHOD_HCA;
368 }
369
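/* Report IB_ATOMIC_HCA only when firmware supports both 8-byte
 * compare-and-swap and fetch-and-add and can return atomic responses in
 * host endianness; otherwise report IB_ATOMIC_NONE.
 */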
370 static void get_atomic_caps(struct mlx5_ib_dev *dev,
371                             struct ib_device_attr *props)
372 {
373         u8 tmp;
374         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
375         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
376         u8 atomic_req_8B_endianness_mode =
377                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
378
379         /* Check if HW supports 8-byte standard atomic operations and is
380          * capable of responding in host endianness
381          */
382         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
383         if (((atomic_operations & tmp) == tmp) &&
384             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
385             (atomic_req_8B_endianness_mode)) {
386                 props->atomic_cap = IB_ATOMIC_HCA;
387         } else {
388                 props->atomic_cap = IB_ATOMIC_NONE;
389         }
390 }
391
392 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
393                                         __be64 *sys_image_guid)
394 {
395         struct mlx5_ib_dev *dev = to_mdev(ibdev);
396         struct mlx5_core_dev *mdev = dev->mdev;
397         u64 tmp;
398         int err;
399
400         switch (mlx5_get_vport_access_method(ibdev)) {
401         case MLX5_VPORT_ACCESS_METHOD_MAD:
402                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
403                                                             sys_image_guid);
404
405         case MLX5_VPORT_ACCESS_METHOD_HCA:
406                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
407                 break;
408
409         case MLX5_VPORT_ACCESS_METHOD_NIC:
410                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
411                 break;
412
413         default:
414                 return -EINVAL;
415         }
416
417         if (!err)
418                 *sys_image_guid = cpu_to_be64(tmp);
419
420         return err;
421
422 }
423
424 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
425                                 u16 *max_pkeys)
426 {
427         struct mlx5_ib_dev *dev = to_mdev(ibdev);
428         struct mlx5_core_dev *mdev = dev->mdev;
429
430         switch (mlx5_get_vport_access_method(ibdev)) {
431         case MLX5_VPORT_ACCESS_METHOD_MAD:
432                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
433
434         case MLX5_VPORT_ACCESS_METHOD_HCA:
435         case MLX5_VPORT_ACCESS_METHOD_NIC:
436                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
437                                                 pkey_table_size));
438                 return 0;
439
440         default:
441                 return -EINVAL;
442         }
443 }
444
445 static int mlx5_query_vendor_id(struct ib_device *ibdev,
446                                 u32 *vendor_id)
447 {
448         struct mlx5_ib_dev *dev = to_mdev(ibdev);
449
450         switch (mlx5_get_vport_access_method(ibdev)) {
451         case MLX5_VPORT_ACCESS_METHOD_MAD:
452                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
453
454         case MLX5_VPORT_ACCESS_METHOD_HCA:
455         case MLX5_VPORT_ACCESS_METHOD_NIC:
456                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
457
458         default:
459                 return -EINVAL;
460         }
461 }
462
463 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
464                                 __be64 *node_guid)
465 {
466         u64 tmp;
467         int err;
468
469         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
470         case MLX5_VPORT_ACCESS_METHOD_MAD:
471                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
472
473         case MLX5_VPORT_ACCESS_METHOD_HCA:
474                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
475                 break;
476
477         case MLX5_VPORT_ACCESS_METHOD_NIC:
478                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
479                 break;
480
481         default:
482                 return -EINVAL;
483         }
484
485         if (!err)
486                 *node_guid = cpu_to_be64(tmp);
487
488         return err;
489 }
490
491 struct mlx5_reg_node_desc {
492         u8      desc[IB_DEVICE_NODE_DESC_MAX];
493 };
494
495 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
496 {
497         struct mlx5_reg_node_desc in;
498
499         if (mlx5_use_mad_ifc(dev))
500                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
501
502         memset(&in, 0, sizeof(in));
503
504         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
505                                     sizeof(struct mlx5_reg_node_desc),
506                                     MLX5_REG_NODE_DESC, 0, 0);
507 }
508
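/* Fill struct ib_device_attr from firmware capabilities.  When userspace
 * provides room for a vendor-specific response (uhw), the extended mlx5
 * capabilities (TSO, RSS, CQE compression, packet pacing, etc.) are
 * appended and resp.response_length is grown accordingly.
 */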
509 static int mlx5_ib_query_device(struct ib_device *ibdev,
510                                 struct ib_device_attr *props,
511                                 struct ib_udata *uhw)
512 {
513         struct mlx5_ib_dev *dev = to_mdev(ibdev);
514         struct mlx5_core_dev *mdev = dev->mdev;
515         int err = -ENOMEM;
516         int max_sq_desc;
517         int max_rq_sg;
518         int max_sq_sg;
519         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
520         struct mlx5_ib_query_device_resp resp = {};
521         size_t resp_len;
522         u64 max_tso;
523
524         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
525         if (uhw->outlen && uhw->outlen < resp_len)
526                 return -EINVAL;
527         else
528                 resp.response_length = resp_len;
529
530         if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
531                 return -EINVAL;
532
533         memset(props, 0, sizeof(*props));
534         err = mlx5_query_system_image_guid(ibdev,
535                                            &props->sys_image_guid);
536         if (err)
537                 return err;
538
539         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
540         if (err)
541                 return err;
542
543         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
544         if (err)
545                 return err;
546
547         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
548                 (fw_rev_min(dev->mdev) << 16) |
549                 fw_rev_sub(dev->mdev);
550         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
551                 IB_DEVICE_PORT_ACTIVE_EVENT             |
552                 IB_DEVICE_SYS_IMAGE_GUID                |
553                 IB_DEVICE_RC_RNR_NAK_GEN;
554
555         if (MLX5_CAP_GEN(mdev, pkv))
556                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
557         if (MLX5_CAP_GEN(mdev, qkv))
558                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
559         if (MLX5_CAP_GEN(mdev, apm))
560                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
561         if (MLX5_CAP_GEN(mdev, xrc))
562                 props->device_cap_flags |= IB_DEVICE_XRC;
563         if (MLX5_CAP_GEN(mdev, imaicl)) {
564                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
565                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
566                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
567                 /* We support 'Gappy' memory registration too */
568                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
569         }
570         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
571         if (MLX5_CAP_GEN(mdev, sho)) {
572                 props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
573                 /* At this stage no support for signature handover */
574                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
575                                       IB_PROT_T10DIF_TYPE_2 |
576                                       IB_PROT_T10DIF_TYPE_3;
577                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
578                                        IB_GUARD_T10DIF_CSUM;
579         }
580         if (MLX5_CAP_GEN(mdev, block_lb_mc))
581                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
582
583         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
584                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
585                         /* Legacy bit to support old userspace libraries */
586                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
587                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
588                 }
589
590                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
591                         props->raw_packet_caps |=
592                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
593
594                 if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
595                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
596                         if (max_tso) {
597                                 resp.tso_caps.max_tso = 1 << max_tso;
598                                 resp.tso_caps.supported_qpts |=
599                                         1 << IB_QPT_RAW_PACKET;
600                                 resp.response_length += sizeof(resp.tso_caps);
601                         }
602                 }
603
604                 if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
605                         resp.rss_caps.rx_hash_function =
606                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
607                         resp.rss_caps.rx_hash_fields_mask =
608                                                 MLX5_RX_HASH_SRC_IPV4 |
609                                                 MLX5_RX_HASH_DST_IPV4 |
610                                                 MLX5_RX_HASH_SRC_IPV6 |
611                                                 MLX5_RX_HASH_DST_IPV6 |
612                                                 MLX5_RX_HASH_SRC_PORT_TCP |
613                                                 MLX5_RX_HASH_DST_PORT_TCP |
614                                                 MLX5_RX_HASH_SRC_PORT_UDP |
615                                                 MLX5_RX_HASH_DST_PORT_UDP;
616                         resp.response_length += sizeof(resp.rss_caps);
617                 }
618         } else {
619                 if (field_avail(typeof(resp), tso_caps, uhw->outlen))
620                         resp.response_length += sizeof(resp.tso_caps);
621                 if (field_avail(typeof(resp), rss_caps, uhw->outlen))
622                         resp.response_length += sizeof(resp.rss_caps);
623         }
624
625         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
626                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
627                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
628         }
629
630         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
631             MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
632                 /* Legacy bit to support old userspace libraries */
633                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
634                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
635         }
636
637         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
638                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
639
640         props->vendor_part_id      = mdev->pdev->device;
641         props->hw_ver              = mdev->pdev->revision;
642
643         props->max_mr_size         = ~0ull;
644         props->page_size_cap       = ~(min_page_size - 1);
645         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
646         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
647         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
648                      sizeof(struct mlx5_wqe_data_seg);
649         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
650         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
651                      sizeof(struct mlx5_wqe_raddr_seg)) /
652                 sizeof(struct mlx5_wqe_data_seg);
653         props->max_sge = min(max_rq_sg, max_sq_sg);
654         props->max_sge_rd          = MLX5_MAX_SGE_RD;
655         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
656         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
657         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
658         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
659         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
660         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
661         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
662         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
663         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
664         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
665         props->max_srq_sge         = max_rq_sg - 1;
666         props->max_fast_reg_page_list_len =
667                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
668         get_atomic_caps(dev, props);
669         props->masked_atomic_cap   = IB_ATOMIC_NONE;
670         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
671         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
672         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
673                                            props->max_mcast_grp;
674         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
675         props->max_ah = INT_MAX;
676         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
677         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
678
679 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
680         if (MLX5_CAP_GEN(mdev, pg))
681                 props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
682         props->odp_caps = dev->odp_caps;
683 #endif
684
685         if (MLX5_CAP_GEN(mdev, cd))
686                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
687
688         if (!mlx5_core_is_pf(mdev))
689                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
690
691         if (mlx5_ib_port_link_layer(ibdev, 1) ==
692             IB_LINK_LAYER_ETHERNET) {
693                 props->rss_caps.max_rwq_indirection_tables =
694                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
695                 props->rss_caps.max_rwq_indirection_table_size =
696                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
697                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
698                 props->max_wq_type_rq =
699                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
700         }
701
702         if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
703                 resp.cqe_comp_caps.max_num =
704                         MLX5_CAP_GEN(dev->mdev, cqe_compression) ?
705                         MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0;
706                 resp.cqe_comp_caps.supported_format =
707                         MLX5_IB_CQE_RES_FORMAT_HASH |
708                         MLX5_IB_CQE_RES_FORMAT_CSUM;
709                 resp.response_length += sizeof(resp.cqe_comp_caps);
710         }
711
712         if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) {
713                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
714                     MLX5_CAP_GEN(mdev, qos)) {
715                         resp.packet_pacing_caps.qp_rate_limit_max =
716                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
717                         resp.packet_pacing_caps.qp_rate_limit_min =
718                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
719                         resp.packet_pacing_caps.supported_qpts |=
720                                 1 << IB_QPT_RAW_PACKET;
721                 }
722                 resp.response_length += sizeof(resp.packet_pacing_caps);
723         }
724
725         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
726                         uhw->outlen)) {
727                 resp.mlx5_ib_support_multi_pkt_send_wqes =
728                         MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
729                 resp.response_length +=
730                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
731         }
732
733         if (field_avail(typeof(resp), reserved, uhw->outlen))
734                 resp.response_length += sizeof(resp.reserved);
735
736         if (uhw->outlen) {
737                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
738
739                 if (err)
740                         return err;
741         }
742
743         return 0;
744 }
745
746 enum mlx5_ib_width {
747         MLX5_IB_WIDTH_1X        = 1 << 0,
748         MLX5_IB_WIDTH_2X        = 1 << 1,
749         MLX5_IB_WIDTH_4X        = 1 << 2,
750         MLX5_IB_WIDTH_8X        = 1 << 3,
751         MLX5_IB_WIDTH_12X       = 1 << 4
752 };
753
754 static int translate_active_width(struct ib_device *ibdev, u8 active_width,
755                                   u8 *ib_width)
756 {
757         struct mlx5_ib_dev *dev = to_mdev(ibdev);
758         int err = 0;
759
760         if (active_width & MLX5_IB_WIDTH_1X) {
761                 *ib_width = IB_WIDTH_1X;
762         } else if (active_width & MLX5_IB_WIDTH_2X) {
763                 mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
764                             (int)active_width);
765                 err = -EINVAL;
766         } else if (active_width & MLX5_IB_WIDTH_4X) {
767                 *ib_width = IB_WIDTH_4X;
768         } else if (active_width & MLX5_IB_WIDTH_8X) {
769                 *ib_width = IB_WIDTH_8X;
770         } else if (active_width & MLX5_IB_WIDTH_12X) {
771                 *ib_width = IB_WIDTH_12X;
772         } else {
773                 mlx5_ib_dbg(dev, "Invalid active_width %d\n",
774                             (int)active_width);
775                 err = -EINVAL;
776         }
777
778         return err;
779 }
780
781 static int mlx5_mtu_to_ib_mtu(int mtu)
782 {
783         switch (mtu) {
784         case 256: return 1;
785         case 512: return 2;
786         case 1024: return 3;
787         case 2048: return 4;
788         case 4096: return 5;
789         default:
790                 pr_warn("invalid mtu\n");
791                 return -1;
792         }
793 }
794
795 enum ib_max_vl_num {
796         __IB_MAX_VL_0           = 1,
797         __IB_MAX_VL_0_1         = 2,
798         __IB_MAX_VL_0_3         = 3,
799         __IB_MAX_VL_0_7         = 4,
800         __IB_MAX_VL_0_14        = 5,
801 };
802
803 enum mlx5_vl_hw_cap {
804         MLX5_VL_HW_0    = 1,
805         MLX5_VL_HW_0_1  = 2,
806         MLX5_VL_HW_0_2  = 3,
807         MLX5_VL_HW_0_3  = 4,
808         MLX5_VL_HW_0_4  = 5,
809         MLX5_VL_HW_0_5  = 6,
810         MLX5_VL_HW_0_6  = 7,
811         MLX5_VL_HW_0_7  = 8,
812         MLX5_VL_HW_0_14 = 15
813 };
814
815 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
816                                 u8 *max_vl_num)
817 {
818         switch (vl_hw_cap) {
819         case MLX5_VL_HW_0:
820                 *max_vl_num = __IB_MAX_VL_0;
821                 break;
822         case MLX5_VL_HW_0_1:
823                 *max_vl_num = __IB_MAX_VL_0_1;
824                 break;
825         case MLX5_VL_HW_0_3:
826                 *max_vl_num = __IB_MAX_VL_0_3;
827                 break;
828         case MLX5_VL_HW_0_7:
829                 *max_vl_num = __IB_MAX_VL_0_7;
830                 break;
831         case MLX5_VL_HW_0_14:
832                 *max_vl_num = __IB_MAX_VL_0_14;
833                 break;
834
835         default:
836                 return -EINVAL;
837         }
838
839         return 0;
840 }
841
842 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
843                                struct ib_port_attr *props)
844 {
845         struct mlx5_ib_dev *dev = to_mdev(ibdev);
846         struct mlx5_core_dev *mdev = dev->mdev;
847         struct mlx5_hca_vport_context *rep;
848         u16 max_mtu;
849         u16 oper_mtu;
850         int err;
851         u8 ib_link_width_oper;
852         u8 vl_hw_cap;
853
854         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
855         if (!rep) {
856                 err = -ENOMEM;
857                 goto out;
858         }
859
860         memset(props, 0, sizeof(*props));
861
862         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
863         if (err)
864                 goto out;
865
866         props->lid              = rep->lid;
867         props->lmc              = rep->lmc;
868         props->sm_lid           = rep->sm_lid;
869         props->sm_sl            = rep->sm_sl;
870         props->state            = rep->vport_state;
871         props->phys_state       = rep->port_physical_state;
872         props->port_cap_flags   = rep->cap_mask1;
873         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
874         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
875         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
876         props->bad_pkey_cntr    = rep->pkey_violation_counter;
877         props->qkey_viol_cntr   = rep->qkey_violation_counter;
878         props->subnet_timeout   = rep->subnet_timeout;
879         props->init_type_reply  = rep->init_type_reply;
880         props->grh_required     = rep->grh_required;
881
882         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
883         if (err)
884                 goto out;
885
886         err = translate_active_width(ibdev, ib_link_width_oper,
887                                      &props->active_width);
888         if (err)
889                 goto out;
890         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
891         if (err)
892                 goto out;
893
894         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
895
896         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
897
898         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
899
900         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
901
902         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
903         if (err)
904                 goto out;
905
906         err = translate_max_vl_num(ibdev, vl_hw_cap,
907                                    &props->max_vl_num);
908 out:
909         kfree(rep);
910         return err;
911 }
912
913 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
914                        struct ib_port_attr *props)
915 {
916         switch (mlx5_get_vport_access_method(ibdev)) {
917         case MLX5_VPORT_ACCESS_METHOD_MAD:
918                 return mlx5_query_mad_ifc_port(ibdev, port, props);
919
920         case MLX5_VPORT_ACCESS_METHOD_HCA:
921                 return mlx5_query_hca_port(ibdev, port, props);
922
923         case MLX5_VPORT_ACCESS_METHOD_NIC:
924                 return mlx5_query_port_roce(ibdev, port, props);
925
926         default:
927                 return -EINVAL;
928         }
929 }
930
931 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
932                              union ib_gid *gid)
933 {
934         struct mlx5_ib_dev *dev = to_mdev(ibdev);
935         struct mlx5_core_dev *mdev = dev->mdev;
936
937         switch (mlx5_get_vport_access_method(ibdev)) {
938         case MLX5_VPORT_ACCESS_METHOD_MAD:
939                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
940
941         case MLX5_VPORT_ACCESS_METHOD_HCA:
942                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
943
944         default:
945                 return -EINVAL;
946         }
947
948 }
949
950 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
951                               u16 *pkey)
952 {
953         struct mlx5_ib_dev *dev = to_mdev(ibdev);
954         struct mlx5_core_dev *mdev = dev->mdev;
955
956         switch (mlx5_get_vport_access_method(ibdev)) {
957         case MLX5_VPORT_ACCESS_METHOD_MAD:
958                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
959
960         case MLX5_VPORT_ACCESS_METHOD_HCA:
961         case MLX5_VPORT_ACCESS_METHOD_NIC:
962                 return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
963                                                  pkey);
964         default:
965                 return -EINVAL;
966         }
967 }
968
969 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
970                                  struct ib_device_modify *props)
971 {
972         struct mlx5_ib_dev *dev = to_mdev(ibdev);
973         struct mlx5_reg_node_desc in;
974         struct mlx5_reg_node_desc out;
975         int err;
976
977         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
978                 return -EOPNOTSUPP;
979
980         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
981                 return 0;
982
983         /*
984          * If possible, pass the node desc to FW so it can generate
985          * a Trap 144 notification.  If the command fails, just ignore it.
986          */
987         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
988         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
989                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
990         if (err)
991                 return err;
992
993         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
994
995         return err;
996 }
997
998 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
999                                struct ib_port_modify *props)
1000 {
1001         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1002         struct ib_port_attr attr;
1003         u32 tmp;
1004         int err;
1005
1006         mutex_lock(&dev->cap_mask_mutex);
1007
1008         err = mlx5_ib_query_port(ibdev, port, &attr);
1009         if (err)
1010                 goto out;
1011
1012         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1013                 ~props->clr_port_cap_mask;
1014
1015         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1016
1017 out:
1018         mutex_unlock(&dev->cap_mask_mutex);
1019         return err;
1020 }
1021
1022 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1023 {
1024         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1025                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1026 }
1027
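/* Round the requested number of blue-flame registers (bfregs) up to a
 * whole number of system pages (each system page carries one or more
 * UARs, depending on 4K-UAR support) and report how many system pages
 * the context needs.  Updates req->total_num_bfregs in place.
 */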
1028 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1029                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1030                              u32 *num_sys_pages)
1031 {
1032         int uars_per_sys_page;
1033         int bfregs_per_sys_page;
1034         int ref_bfregs = req->total_num_bfregs;
1035
1036         if (req->total_num_bfregs == 0)
1037                 return -EINVAL;
1038
1039         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1040         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1041
1042         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1043                 return -ENOMEM;
1044
1045         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1046         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1047         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1048         *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1049
1050         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1051                 return -EINVAL;
1052
1053         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n",
1054                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1055                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1056                     req->total_num_bfregs, *num_sys_pages);
1057
1058         return 0;
1059 }
1060
1061 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1062 {
1063         struct mlx5_bfreg_info *bfregi;
1064         int err;
1065         int i;
1066
1067         bfregi = &context->bfregi;
1068         for (i = 0; i < bfregi->num_sys_pages; i++) {
1069                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1070                 if (err)
1071                         goto error;
1072
1073                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1074         }
1075         return 0;
1076
1077 error:
1078         for (--i; i >= 0; i--)
1079                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1080                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1081
1082         return err;
1083 }
1084
1085 static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1086 {
1087         struct mlx5_bfreg_info *bfregi;
1088         int err;
1089         int i;
1090
1091         bfregi = &context->bfregi;
1092         for (i = 0; i < bfregi->num_sys_pages; i++) {
1093                 err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1094                 if (err) {
1095                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1096                         return err;
1097                 }
1098         }
1099         return 0;
1100 }
1101
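/* Allocate a user context: parse the v0/v2 request, size and allocate
 * the UAR/bfreg pools, allocate a transport domain when supported, and
 * return the capabilities (CQE version, UAR layout, core clock offset)
 * that the userspace library needs in the response.
 */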
1102 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1103                                                   struct ib_udata *udata)
1104 {
1105         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1106         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1107         struct mlx5_ib_alloc_ucontext_resp resp = {};
1108         struct mlx5_ib_ucontext *context;
1109         struct mlx5_bfreg_info *bfregi;
1110         int ver;
1111         int err;
1112         size_t reqlen;
1113         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1114                                      max_cqe_version);
1115         bool lib_uar_4k;
1116
1117         if (!dev->ib_active)
1118                 return ERR_PTR(-EAGAIN);
1119
1120         if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
1121                 return ERR_PTR(-EINVAL);
1122
1123         reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
1124         if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1125                 ver = 0;
1126         else if (reqlen >= min_req_v2)
1127                 ver = 2;
1128         else
1129                 return ERR_PTR(-EINVAL);
1130
1131         err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
1132         if (err)
1133                 return ERR_PTR(err);
1134
1135         if (req.flags)
1136                 return ERR_PTR(-EINVAL);
1137
1138         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1139                 return ERR_PTR(-EOPNOTSUPP);
1140
1141         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1142                                     MLX5_NON_FP_BFREGS_PER_UAR);
1143         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1144                 return ERR_PTR(-EINVAL);
1145
1146         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1147         if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1148                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1149         resp.cache_line_size = cache_line_size();
1150         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1151         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1152         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1153         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1154         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1155         resp.cqe_version = min_t(__u8,
1156                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1157                                  req.max_cqe_version);
1158         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1159                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1160         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1161                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1162         resp.response_length = min(offsetof(typeof(resp), response_length) +
1163                                    sizeof(resp.response_length), udata->outlen);
1164
1165         context = kzalloc(sizeof(*context), GFP_KERNEL);
1166         if (!context)
1167                 return ERR_PTR(-ENOMEM);
1168
1169         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1170         bfregi = &context->bfregi;
1171
1172         /* updates req->total_num_bfregs */
1173         err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages);
1174         if (err)
1175                 goto out_ctx;
1176
1177         mutex_init(&bfregi->lock);
1178         bfregi->lib_uar_4k = lib_uar_4k;
1179         bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count),
1180                                 GFP_KERNEL);
1181         if (!bfregi->count) {
1182                 err = -ENOMEM;
1183                 goto out_ctx;
1184         }
1185
1186         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1187                                     sizeof(*bfregi->sys_pages),
1188                                     GFP_KERNEL);
1189         if (!bfregi->sys_pages) {
1190                 err = -ENOMEM;
1191                 goto out_count;
1192         }
1193
1194         err = allocate_uars(dev, context);
1195         if (err)
1196                 goto out_sys_pages;
1197
1198 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1199         context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1200 #endif
1201
1202         context->upd_xlt_page = __get_free_page(GFP_KERNEL);
1203         if (!context->upd_xlt_page) {
1204                 err = -ENOMEM;
1205                 goto out_uars;
1206         }
1207         mutex_init(&context->upd_xlt_page_mutex);
1208
1209         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
1210                 err = mlx5_core_alloc_transport_domain(dev->mdev,
1211                                                        &context->tdn);
1212                 if (err)
1213                         goto out_page;
1214         }
1215
1216         INIT_LIST_HEAD(&context->vma_private_list);
1217         INIT_LIST_HEAD(&context->db_page_list);
1218         mutex_init(&context->db_page_mutex);
1219
1220         resp.tot_bfregs = req.total_num_bfregs;
1221         resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1222
1223         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1224                 resp.response_length += sizeof(resp.cqe_version);
1225
1226         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1227                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1228                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1229                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1230         }
1231
1232         /*
1233          * We don't want to expose information from the PCI bar that is located
1234          * after 4096 bytes, so if the arch only supports larger pages, let's
1235          * pretend we don't support reading the HCA's core clock. This is also
1236          * enforced by the mmap function.
1237          */
1238         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1239                 if (PAGE_SIZE <= 4096) {
1240                         resp.comp_mask |=
1241                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1242                         resp.hca_core_clock_offset =
1243                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1244                 }
1245                 resp.response_length += sizeof(resp.hca_core_clock_offset) +
1246                                         sizeof(resp.reserved2);
1247         }
1248
1249         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1250                 resp.response_length += sizeof(resp.log_uar_size);
1251
1252         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1253                 resp.response_length += sizeof(resp.num_uars_per_page);
1254
1255         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1256         if (err)
1257                 goto out_td;
1258
1259         bfregi->ver = ver;
1260         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1261         context->cqe_version = resp.cqe_version;
1262         context->lib_caps = req.lib_caps;
1263         print_lib_caps(dev, context->lib_caps);
1264
1265         return &context->ibucontext;
1266
1267 out_td:
1268         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1269                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1270
1271 out_page:
1272         free_page(context->upd_xlt_page);
1273
1274 out_uars:
1275         deallocate_uars(dev, context);
1276
1277 out_sys_pages:
1278         kfree(bfregi->sys_pages);
1279
1280 out_count:
1281         kfree(bfregi->count);
1282
1283 out_ctx:
1284         kfree(context);
1285
1286         return ERR_PTR(err);
1287 }
1288
1289 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1290 {
1291         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1292         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1293         struct mlx5_bfreg_info *bfregi;
1294
1295         bfregi = &context->bfregi;
1296         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1297                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1298
1299         free_page(context->upd_xlt_page);
1300         deallocate_uars(dev, context);
1301         kfree(bfregi->sys_pages);
1302         kfree(bfregi->count);
1303         kfree(context);
1304
1305         return 0;
1306 }
1307
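/* Translate a UAR index into the PFN of its page within BAR 0, taking
 * into account how many firmware UARs share a single system page when
 * 4K UARs are in use.
 */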
1308 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1309                                  struct mlx5_bfreg_info *bfregi,
1310                                  int idx)
1311 {
1312         int fw_uars_per_page;
1313
1314         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
1315
1316         return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) +
1317                         bfregi->sys_pages[idx] / fw_uars_per_page;
1318 }
1319
1320 static int get_command(unsigned long offset)
1321 {
1322         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1323 }
1324
1325 static int get_arg(unsigned long offset)
1326 {
1327         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1328 }
1329
1330 static int get_index(unsigned long offset)
1331 {
1332         return get_arg(offset);
1333 }
1334
1335 static void  mlx5_ib_vma_open(struct vm_area_struct *area)
1336 {
1337         /* vma_open is called when a new VMA is created on top of our VMA.  This
1338          * is done through either the mremap flow or split_vma (usually due to
1339          * mlock, madvise, munmap, etc.).  We do not support cloning the VMA,
1340          * as this VMA is strongly hardware related.  Therefore we set the
1341          * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
1342          * calling us again and trying to do incorrect actions.  We assume that
1343          * the original VMA size is exactly a single page, and therefore no
1344          * "splitting" operations will happen to it.
1345          */
1346         area->vm_ops = NULL;
1347 }
1348
1349 static void  mlx5_ib_vma_close(struct vm_area_struct *area)
1350 {
1351         struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
1352
1353         /* It's guaranteed that all VMAs opened on a FD are closed before the
1354          * file itself is closed, so no synchronization is needed with the
1355          * regular closing flow (e.g. mlx5_ib_dealloc_ucontext).
1356          * However, we do need to synchronize with vma accesses made as part of
1357          * mlx5_ib_disassociate_ucontext.
1358          * The close operation is usually called under mm->mmap_sem, except
1359          * when the process is exiting.
1360          * The exiting case is handled explicitly as part of
1361          * mlx5_ib_disassociate_ucontext.
1362          */
1363         mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
1364
1365         /* Set the vma context pointer to NULL in the mlx5_ib driver's
1366          * private data to protect against a race with
1367          * mlx5_ib_disassociate_ucontext().
1368          */
1369         mlx5_ib_vma_priv_data->vma = NULL;
1370         list_del(&mlx5_ib_vma_priv_data->list);
1371         kfree(mlx5_ib_vma_priv_data);
1372 }
1373
1374 static const struct vm_operations_struct mlx5_ib_vm_ops = {
1375         .open = mlx5_ib_vma_open,
1376         .close = mlx5_ib_vma_close
1377 };
1378
1379 static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
1380                                 struct mlx5_ib_ucontext *ctx)
1381 {
1382         struct mlx5_ib_vma_private_data *vma_prv;
1383         struct list_head *vma_head = &ctx->vma_private_list;
1384
1385         vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
1386         if (!vma_prv)
1387                 return -ENOMEM;
1388
1389         vma_prv->vma = vma;
1390         vma->vm_private_data = vma_prv;
1391         vma->vm_ops =  &mlx5_ib_vm_ops;
1392
1393         list_add(&vma_prv->list, vma_head);
1394
1395         return 0;
1396 }
1397
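/* Called when the uverbs device is being disassociated from user space:
 * zap the PTEs of every VMA the context has mapped and clear their
 * vm_ops so no further driver callbacks occur.  If the owning mm is
 * already gone, just wait for the owning task to finish exiting.
 */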
1398 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1399 {
1400         int ret;
1401         struct vm_area_struct *vma;
1402         struct mlx5_ib_vma_private_data *vma_private, *n;
1403         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1404         struct task_struct *owning_process  = NULL;
1405         struct mm_struct   *owning_mm       = NULL;
1406
1407         owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
1408         if (!owning_process)
1409                 return;
1410
1411         owning_mm = get_task_mm(owning_process);
1412         if (!owning_mm) {
1413                 pr_info("no mm, disassociate ucontext is pending task termination\n");
1414                 while (1) {
1415                         put_task_struct(owning_process);
1416                         usleep_range(1000, 2000);
1417                         owning_process = get_pid_task(ibcontext->tgid,
1418                                                       PIDTYPE_PID);
1419                         if (!owning_process ||
1420                             owning_process->state == TASK_DEAD) {
1421                                 pr_info("disassociate ucontext done, task was terminated\n");
1422                                 /* In case the task was dead, we need to
1423                                  * release the task struct.
1424                                  */
1425                                 if (owning_process)
1426                                         put_task_struct(owning_process);
1427                                 return;
1428                         }
1429                 }
1430         }
1431
1432         /* need to protect from a race on closing the vma as part of
1433          * mlx5_ib_vma_close.
1434          */
1435         down_read(&owning_mm->mmap_sem);
1436         list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
1437                                  list) {
1438                 vma = vma_private->vma;
1439                 ret = zap_vma_ptes(vma, vma->vm_start,
1440                                    PAGE_SIZE);
1441                 WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
1442                 /* The context is going to be destroyed, so the
1443                  * vm_ops must not be accessed any more.
1444                  */
1445                 vma->vm_ops = NULL;
1446                 list_del(&vma_private->list);
1447                 kfree(vma_private);
1448         }
1449         up_read(&owning_mm->mmap_sem);
1450         mmput(owning_mm);
1451         put_task_struct(owning_process);
1452 }
1453
1454 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1455 {
1456         switch (cmd) {
1457         case MLX5_IB_MMAP_WC_PAGE:
1458                 return "WC";
1459         case MLX5_IB_MMAP_REGULAR_PAGE:
1460                 return "best effort WC";
1461         case MLX5_IB_MMAP_NC_PAGE:
1462                 return "NC";
1463         default:
1464                 return NULL;
1465         }
1466 }
1467
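/* Map a single UAR page into user space. MLX5_IB_MMAP_WC_PAGE requires real
 * write-combining support (rejected with -EPERM on architectures without it),
 * MLX5_IB_MMAP_REGULAR_PAGE falls back to best-effort WC, and
 * MLX5_IB_MMAP_NC_PAGE maps the page non-cached. On success the VMA is
 * registered with the ucontext via mlx5_ib_set_vma_data().
 */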
1468 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1469                     struct vm_area_struct *vma,
1470                     struct mlx5_ib_ucontext *context)
1471 {
1472         struct mlx5_bfreg_info *bfregi = &context->bfregi;
1473         int err;
1474         unsigned long idx;
1475         phys_addr_t pfn, pa;
1476         pgprot_t prot;
1477         int uars_per_page;
1478
1479         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1480                 return -EINVAL;
1481
1482         uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
1483         idx = get_index(vma->vm_pgoff);
1484         if (idx % uars_per_page ||
1485             idx * uars_per_page >= bfregi->num_sys_pages) {
1486                 mlx5_ib_warn(dev, "invalid uar index %lu\n", idx);
1487                 return -EINVAL;
1488         }
1489
1490         switch (cmd) {
1491         case MLX5_IB_MMAP_WC_PAGE:
1492 /* Some architectures don't support WC memory */
1493 #if defined(CONFIG_X86)
1494                 if (!pat_enabled())
1495                         return -EPERM;
1496 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
1497                         return -EPERM;
1498 #endif
1499         /* fall through */
1500         case MLX5_IB_MMAP_REGULAR_PAGE:
1501                 /* For MLX5_IB_MMAP_REGULAR_PAGE, make a best effort to get WC */
1502                 prot = pgprot_writecombine(vma->vm_page_prot);
1503                 break;
1504         case MLX5_IB_MMAP_NC_PAGE:
1505                 prot = pgprot_noncached(vma->vm_page_prot);
1506                 break;
1507         default:
1508                 return -EINVAL;
1509         }
1510
1511         pfn = uar_index2pfn(dev, bfregi, idx);
1512         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1513
1514         vma->vm_page_prot = prot;
1515         err = io_remap_pfn_range(vma, vma->vm_start, pfn,
1516                                  PAGE_SIZE, vma->vm_page_prot);
1517         if (err) {
1518                 mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
1519                             err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
1520                 return -EAGAIN;
1521         }
1522
1523         pa = pfn << PAGE_SHIFT;
1524         mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
1525                     vma->vm_start, &pa);
1526
1527         return mlx5_ib_set_vma_data(vma, context);
1528 }
1529
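/* mmap entry point. The mmap offset encodes both a command and a page index;
 * get_command()/get_index() (defined elsewhere in mlx5_ib.h, not shown here)
 * extract them from vma->vm_pgoff. A user-space sketch, assuming that
 * encoding, would look roughly like:
 *
 *	off_t offset = mmap_offset(MLX5_IB_MMAP_WC_PAGE, uar_idx);
 *	void *uar = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, offset);
 *
 * where mmap_offset() is a hypothetical helper mirroring the kernel's
 * command/index packing.
 */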
1530 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1531 {
1532         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1533         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1534         unsigned long command;
1535         phys_addr_t pfn;
1536
1537         command = get_command(vma->vm_pgoff);
1538         switch (command) {
1539         case MLX5_IB_MMAP_WC_PAGE:
1540         case MLX5_IB_MMAP_NC_PAGE:
1541         case MLX5_IB_MMAP_REGULAR_PAGE:
1542                 return uar_mmap(dev, command, vma, context);
1543
1544         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1545                 return -ENOSYS;
1546
1547         case MLX5_IB_MMAP_CORE_CLOCK:
1548                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1549                         return -EINVAL;
1550
1551                 if (vma->vm_flags & VM_WRITE)
1552                         return -EPERM;
1553
1554                 /* Don't expose to user-space information it shouldn't have */
1555                 if (PAGE_SIZE > 4096)
1556                         return -EOPNOTSUPP;
1557
1558                 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1559                 pfn = (dev->mdev->iseg_base +
1560                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1561                         PAGE_SHIFT;
1562                 if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1563                                        PAGE_SIZE, vma->vm_page_prot))
1564                         return -EAGAIN;
1565
1566                 mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
1567                             vma->vm_start,
1568                             (unsigned long long)pfn << PAGE_SHIFT);
1569                 break;
1570
1571         default:
1572                 return -EINVAL;
1573         }
1574
1575         return 0;
1576 }
1577
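/* Allocate a protection domain. For user contexts the hardware PD number is
 * returned through udata so the user-space provider library can reference it
 * directly; kernel callers skip that copy.
 */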
1578 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1579                                       struct ib_ucontext *context,
1580                                       struct ib_udata *udata)
1581 {
1582         struct mlx5_ib_alloc_pd_resp resp;
1583         struct mlx5_ib_pd *pd;
1584         int err;
1585
1586         pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1587         if (!pd)
1588                 return ERR_PTR(-ENOMEM);
1589
1590         err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1591         if (err) {
1592                 kfree(pd);
1593                 return ERR_PTR(err);
1594         }
1595
1596         if (context) {
1597                 resp.pdn = pd->pdn;
1598                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1599                         mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1600                         kfree(pd);
1601                         return ERR_PTR(-EFAULT);
1602                 }
1603         }
1604
1605         return &pd->ibpd;
1606 }
1607
1608 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1609 {
1610         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1611         struct mlx5_ib_pd *mpd = to_mpd(pd);
1612
1613         mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1614         kfree(mpd);
1615
1616         return 0;
1617 }
1618
1619 enum {
1620         MATCH_CRITERIA_ENABLE_OUTER_BIT,
1621         MATCH_CRITERIA_ENABLE_MISC_BIT,
1622         MATCH_CRITERIA_ENABLE_INNER_BIT
1623 };
1624
1625 #define HEADER_IS_ZERO(match_criteria, headers)                            \
1626         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1627                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
1628
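/* Build the match_criteria_enable bitmask used when programming the flow
 * table entry: a bit is set for each section (outer headers, misc parameters,
 * inner headers) of the match criteria that is not all-zero. For example, a
 * rule matching only outer IPv4 addresses yields just
 * MATCH_CRITERIA_ENABLE_OUTER_BIT (0x1).
 */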
1629 static u8 get_match_criteria_enable(u32 *match_criteria)
1630 {
1631         u8 match_criteria_enable;
1632
1633         match_criteria_enable =
1634                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1635                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
1636         match_criteria_enable |=
1637                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1638                 MATCH_CRITERIA_ENABLE_MISC_BIT;
1639         match_criteria_enable |=
1640                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1641                 MATCH_CRITERIA_ENABLE_INNER_BIT;
1642
1643         return match_criteria_enable;
1644 }
1645
1646 static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1647 {
1648         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1649         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1650 }
1651
1652 static void set_flow_label(void *misc_c, void *misc_v, u8 mask, u8 val,
1653                            bool inner)
1654 {
1655         if (inner) {
1656                 MLX5_SET(fte_match_set_misc,
1657                          misc_c, inner_ipv6_flow_label, mask);
1658                 MLX5_SET(fte_match_set_misc,
1659                          misc_v, inner_ipv6_flow_label, val);
1660         } else {
1661                 MLX5_SET(fte_match_set_misc,
1662                          misc_c, outer_ipv6_flow_label, mask);
1663                 MLX5_SET(fte_match_set_misc,
1664                          misc_v, outer_ipv6_flow_label, val);
1665         }
1666 }
1667
1668 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1669 {
1670         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1671         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1672         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1673         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1674 }
1675
1676 #define LAST_ETH_FIELD vlan_tag
1677 #define LAST_IB_FIELD sl
1678 #define LAST_IPV4_FIELD tos
1679 #define LAST_IPV6_FIELD traffic_class
1680 #define LAST_TCP_UDP_FIELD src_port
1681 #define LAST_TUNNEL_FIELD tunnel_id
1682 #define LAST_FLOW_TAG_FIELD tag_id
1683
1684 /* Non-zero if any filter field beyond @field, the last supported one, is set */
1685 #define FIELDS_NOT_SUPPORTED(filter, field)\
1686         memchr_inv((void *)&filter.field  +\
1687                    sizeof(filter.field), 0,\
1688                    sizeof(filter) -\
1689                    offsetof(typeof(filter), field) -\
1690                    sizeof(filter.field))
1691
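/* FIELDS_NOT_SUPPORTED() above is non-zero when the user set mask bits in
 * filter fields beyond the last one this driver can offload (e.g. fields
 * added by a newer uAPI); parse_flow_attr() rejects such specs with
 * -EOPNOTSUPP rather than silently ignoring them, and otherwise translates
 * each ib_flow_spec into the corresponding mlx5 match criteria/value pair.
 */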
1692 static int parse_flow_attr(u32 *match_c, u32 *match_v,
1693                            const union ib_flow_spec *ib_spec, u32 *tag_id)
1694 {
1695         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1696                                            misc_parameters);
1697         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1698                                            misc_parameters);
1699         void *headers_c;
1700         void *headers_v;
1701
1702         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
1703                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1704                                          inner_headers);
1705                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1706                                          inner_headers);
1707         } else {
1708                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1709                                          outer_headers);
1710                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1711                                          outer_headers);
1712         }
1713
1714         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
1715         case IB_FLOW_SPEC_ETH:
1716                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1717                         return -EOPNOTSUPP;
1718
1719                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1720                                              dmac_47_16),
1721                                 ib_spec->eth.mask.dst_mac);
1722                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1723                                              dmac_47_16),
1724                                 ib_spec->eth.val.dst_mac);
1725
1726                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1727                                              smac_47_16),
1728                                 ib_spec->eth.mask.src_mac);
1729                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1730                                              smac_47_16),
1731                                 ib_spec->eth.val.src_mac);
1732
1733                 if (ib_spec->eth.mask.vlan_tag) {
1734                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1735                                  vlan_tag, 1);
1736                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1737                                  vlan_tag, 1);
1738
1739                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1740                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1741                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1742                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1743
1744                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1745                                  first_cfi,
1746                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1747                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1748                                  first_cfi,
1749                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1750
1751                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1752                                  first_prio,
1753                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1754                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1755                                  first_prio,
1756                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1757                 }
1758                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1759                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
1760                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1761                          ethertype, ntohs(ib_spec->eth.val.ether_type));
1762                 break;
1763         case IB_FLOW_SPEC_IPV4:
1764                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1765                         return -EOPNOTSUPP;
1766
1767                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1768                          ethertype, 0xffff);
1769                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1770                          ethertype, ETH_P_IP);
1771
1772                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1773                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1774                        &ib_spec->ipv4.mask.src_ip,
1775                        sizeof(ib_spec->ipv4.mask.src_ip));
1776                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1777                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1778                        &ib_spec->ipv4.val.src_ip,
1779                        sizeof(ib_spec->ipv4.val.src_ip));
1780                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1781                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1782                        &ib_spec->ipv4.mask.dst_ip,
1783                        sizeof(ib_spec->ipv4.mask.dst_ip));
1784                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1785                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1786                        &ib_spec->ipv4.val.dst_ip,
1787                        sizeof(ib_spec->ipv4.val.dst_ip));
1788
1789                 set_tos(headers_c, headers_v,
1790                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1791
1792                 set_proto(headers_c, headers_v,
1793                           ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1794                 break;
1795         case IB_FLOW_SPEC_IPV6:
1796                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1797                         return -EOPNOTSUPP;
1798
1799                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1800                          ethertype, 0xffff);
1801                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1802                          ethertype, ETH_P_IPV6);
1803
1804                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1805                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1806                        &ib_spec->ipv6.mask.src_ip,
1807                        sizeof(ib_spec->ipv6.mask.src_ip));
1808                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1809                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1810                        &ib_spec->ipv6.val.src_ip,
1811                        sizeof(ib_spec->ipv6.val.src_ip));
1812                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1813                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1814                        &ib_spec->ipv6.mask.dst_ip,
1815                        sizeof(ib_spec->ipv6.mask.dst_ip));
1816                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1817                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1818                        &ib_spec->ipv6.val.dst_ip,
1819                        sizeof(ib_spec->ipv6.val.dst_ip));
1820
1821                 set_tos(headers_c, headers_v,
1822                         ib_spec->ipv6.mask.traffic_class,
1823                         ib_spec->ipv6.val.traffic_class);
1824
1825                 set_proto(headers_c, headers_v,
1826                           ib_spec->ipv6.mask.next_hdr,
1827                           ib_spec->ipv6.val.next_hdr);
1828
1829                 set_flow_label(misc_params_c, misc_params_v,
1830                                ntohl(ib_spec->ipv6.mask.flow_label),
1831                                ntohl(ib_spec->ipv6.val.flow_label),
1832                                ib_spec->type & IB_FLOW_SPEC_INNER);
1833
1834                 break;
1835         case IB_FLOW_SPEC_TCP:
1836                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1837                                          LAST_TCP_UDP_FIELD))
1838                         return -EOPNOTSUPP;
1839
1840                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1841                          0xff);
1842                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1843                          IPPROTO_TCP);
1844
1845                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
1846                          ntohs(ib_spec->tcp_udp.mask.src_port));
1847                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
1848                          ntohs(ib_spec->tcp_udp.val.src_port));
1849
1850                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
1851                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1852                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
1853                          ntohs(ib_spec->tcp_udp.val.dst_port));
1854                 break;
1855         case IB_FLOW_SPEC_UDP:
1856                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1857                                          LAST_TCP_UDP_FIELD))
1858                         return -EOPNOTSUPP;
1859
1860                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1861                          0xff);
1862                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1863                          IPPROTO_UDP);
1864
1865                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
1866                          ntohs(ib_spec->tcp_udp.mask.src_port));
1867                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
1868                          ntohs(ib_spec->tcp_udp.val.src_port));
1869
1870                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
1871                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1872                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
1873                          ntohs(ib_spec->tcp_udp.val.dst_port));
1874                 break;
1875         case IB_FLOW_SPEC_VXLAN_TUNNEL:
1876                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
1877                                          LAST_TUNNEL_FIELD))
1878                         return -EOPNOTSUPP;
1879
1880                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
1881                          ntohl(ib_spec->tunnel.mask.tunnel_id));
1882                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
1883                          ntohl(ib_spec->tunnel.val.tunnel_id));
1884                 break;
1885         case IB_FLOW_SPEC_ACTION_TAG:
1886                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
1887                                          LAST_FLOW_TAG_FIELD))
1888                         return -EOPNOTSUPP;
1889                 if (ib_spec->flow_tag.tag_id >= BIT(24))
1890                         return -EINVAL;
1891
1892                 *tag_id = ib_spec->flow_tag.tag_id;
1893                 break;
1894         default:
1895                 return -EINVAL;
1896         }
1897
1898         return 0;
1899 }
1900
1901 /* If a flow could catch both multicast and unicast packets,
1902  * it must not be placed in the multicast flow steering table, as such a
1903  * rule could steal multicast packets intended for other rules.
1904  */
1905 static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
1906 {
1907         struct ib_flow_spec_eth *eth_spec;
1908
1909         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
1910             ib_attr->size < sizeof(struct ib_flow_attr) +
1911             sizeof(struct ib_flow_spec_eth) ||
1912             ib_attr->num_of_specs < 1)
1913                 return false;
1914
1915         eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
1916         if (eth_spec->type != IB_FLOW_SPEC_ETH ||
1917             eth_spec->size != sizeof(*eth_spec))
1918                 return false;
1919
1920         return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
1921                is_multicast_ether_addr(eth_spec->val.dst_mac);
1922 }
1923
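/* A flow attribute is considered valid only if any L2 ethertype match it
 * carries is consistent with an IPv4 spec in the same rule: either no IPv4
 * spec is present, or the ethertype (if masked at all) is exactly ETH_P_IP.
 */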
1924 static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
1925 {
1926         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
1927         bool has_ipv4_spec = false;
1928         bool eth_type_ipv4 = true;
1929         unsigned int spec_index;
1930
1931         /* Validate that ethertype is correct */
1932         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1933                 if (ib_spec->type == IB_FLOW_SPEC_ETH &&
1934                     ib_spec->eth.mask.ether_type) {
1935                         if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
1936                               ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
1937                                 eth_type_ipv4 = false;
1938                 } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
1939                         has_ipv4_spec = true;
1940                 }
1941                 ib_spec = (void *)ib_spec + ib_spec->size;
1942         }
1943         return !has_ipv4_spec || eth_type_ipv4;
1944 }
1945
1946 static void put_flow_table(struct mlx5_ib_dev *dev,
1947                            struct mlx5_ib_flow_prio *prio, bool ft_added)
1948 {
1949         prio->refcount -= !!ft_added;
1950         if (!prio->refcount) {
1951                 mlx5_destroy_flow_table(prio->flow_table);
1952                 prio->flow_table = NULL;
1953         }
1954 }
1955
1956 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
1957 {
1958         struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
1959         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
1960                                                           struct mlx5_ib_flow_handler,
1961                                                           ibflow);
1962         struct mlx5_ib_flow_handler *iter, *tmp;
1963
1964         mutex_lock(&dev->flow_db.lock);
1965
1966         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
1967                 mlx5_del_flow_rules(iter->rule);
1968                 put_flow_table(dev, iter->prio, true);
1969                 list_del(&iter->list);
1970                 kfree(iter);
1971         }
1972
1973         mlx5_del_flow_rules(handler->rule);
1974         put_flow_table(dev, handler->prio, true);
1975         mutex_unlock(&dev->flow_db.lock);
1976
1977         kfree(handler);
1978
1979         return 0;
1980 }
1981
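/* Each IB flow priority maps onto a pair of core flow-steering priorities:
 * 2 * prio for don't-trap rules and 2 * prio + 1 for regular rules. For
 * example, IB priority 3 becomes core priority 6 (don't trap) or 7 (normal),
 * keeping the don't-trap variant at a distinct, lower-numbered core priority
 * than its regular counterpart.
 */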
1982 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
1983 {
1984         priority *= 2;
1985         if (!dont_trap)
1986                 priority++;
1987         return priority;
1988 }
1989
1990 enum flow_table_type {
1991         MLX5_IB_FT_RX,
1992         MLX5_IB_FT_TX
1993 };
1994
1995 #define MLX5_FS_MAX_TYPES        10
1996 #define MLX5_FS_MAX_ENTRIES      32000UL
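/* Pick (and lazily create) the flow table for a rule. NORMAL rules go to the
 * bypass namespace, with multicast-only rules pinned at
 * MLX5_IB_FLOW_MCAST_PRIO; ALL_DEFAULT/MC_DEFAULT rules go to the leftovers
 * namespace; SNIFFER rules use the RX/TX sniffer namespaces. Tables are
 * auto-grouped and reference-counted through prio->refcount.
 */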
1997 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
1998                                                 struct ib_flow_attr *flow_attr,
1999                                                 enum flow_table_type ft_type)
2000 {
2001         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
2002         struct mlx5_flow_namespace *ns = NULL;
2003         struct mlx5_ib_flow_prio *prio;
2004         struct mlx5_flow_table *ft;
2005         int num_entries;
2006         int num_groups;
2007         int priority;
2008         int err = 0;
2009
2010         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2011                 if (flow_is_multicast_only(flow_attr) &&
2012                     !dont_trap)
2013                         priority = MLX5_IB_FLOW_MCAST_PRIO;
2014                 else
2015                         priority = ib_prio_to_core_prio(flow_attr->priority,
2016                                                         dont_trap);
2017                 ns = mlx5_get_flow_namespace(dev->mdev,
2018                                              MLX5_FLOW_NAMESPACE_BYPASS);
2019                 num_entries = MLX5_FS_MAX_ENTRIES;
2020                 num_groups = MLX5_FS_MAX_TYPES;
2021                 prio = &dev->flow_db.prios[priority];
2022         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2023                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2024                 ns = mlx5_get_flow_namespace(dev->mdev,
2025                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
2026                 build_leftovers_ft_param(&priority,
2027                                          &num_entries,
2028                                          &num_groups);
2029                 prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
2030         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2031                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
2032                                         allow_sniffer_and_nic_rx_shared_tir))
2033                         return ERR_PTR(-ENOTSUPP);
2034
2035                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
2036                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
2037                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
2038
2039                 prio = &dev->flow_db.sniffer[ft_type];
2040                 priority = 0;
2041                 num_entries = 1;
2042                 num_groups = 1;
2043         }
2044
2045         if (!ns)
2046                 return ERR_PTR(-ENOTSUPP);
2047
2048         ft = prio->flow_table;
2049         if (!ft) {
2050                 ft = mlx5_create_auto_grouped_flow_table(ns, priority,
2051                                                          num_entries,
2052                                                          num_groups,
2053                                                          0, 0);
2054
2055                 if (!IS_ERR(ft)) {
2056                         prio->refcount = 0;
2057                         prio->flow_table = ft;
2058                 } else {
2059                         err = PTR_ERR(ft);
2060                 }
2061         }
2062
2063         return err ? ERR_PTR(err) : prio;
2064 }
2065
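/* Translate an ib_flow_attr and its trailing spec list into an mlx5 rule:
 * parse each spec into match criteria/values, derive match_criteria_enable,
 * then install a rule forwarding to @dst (a TIR) or, when @dst is NULL, to
 * the next priority (used by the don't-trap path). A non-default flow tag is
 * only honoured for NORMAL rules, not for leftovers.
 */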
2066 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
2067                                                      struct mlx5_ib_flow_prio *ft_prio,
2068                                                      const struct ib_flow_attr *flow_attr,
2069                                                      struct mlx5_flow_destination *dst)
2070 {
2071         struct mlx5_flow_table  *ft = ft_prio->flow_table;
2072         struct mlx5_ib_flow_handler *handler;
2073         struct mlx5_flow_act flow_act = {0};
2074         struct mlx5_flow_spec *spec;
2075         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
2076         unsigned int spec_index;
2077         u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
2078         int err = 0;
2079
2080         if (!is_valid_attr(flow_attr))
2081                 return ERR_PTR(-EINVAL);
2082
2083         spec = mlx5_vzalloc(sizeof(*spec));
2084         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
2085         if (!handler || !spec) {
2086                 err = -ENOMEM;
2087                 goto free;
2088         }
2089
2090         INIT_LIST_HEAD(&handler->list);
2091
2092         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2093                 err = parse_flow_attr(spec->match_criteria,
2094                                       spec->match_value, ib_flow, &flow_tag);
2095                 if (err < 0)
2096                         goto free;
2097
2098                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
2099         }
2100
2101         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
2102         flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
2103                 MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
2104
2105         if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG &&
2106             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2107              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
2108                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
2109                              flow_tag, flow_attr->type);
2110                 err = -EINVAL;
2111                 goto free;
2112         }
2113         flow_act.flow_tag = flow_tag;
2114         handler->rule = mlx5_add_flow_rules(ft, spec,
2115                                             &flow_act,
2116                                             dst, 1);
2117
2118         if (IS_ERR(handler->rule)) {
2119                 err = PTR_ERR(handler->rule);
2120                 goto free;
2121         }
2122
2123         ft_prio->refcount++;
2124         handler->prio = ft_prio;
2125
2126         ft_prio->flow_table = ft;
2127 free:
2128         if (err)
2129                 kfree(handler);
2130         kvfree(spec);
2131         return err ? ERR_PTR(err) : handler;
2132 }
2133
2134 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
2135                                                           struct mlx5_ib_flow_prio *ft_prio,
2136                                                           struct ib_flow_attr *flow_attr,
2137                                                           struct mlx5_flow_destination *dst)
2138 {
2139         struct mlx5_ib_flow_handler *handler_dst = NULL;
2140         struct mlx5_ib_flow_handler *handler = NULL;
2141
2142         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
2143         if (!IS_ERR(handler)) {
2144                 handler_dst = create_flow_rule(dev, ft_prio,
2145                                                flow_attr, dst);
2146                 if (IS_ERR(handler_dst)) {
2147                         mlx5_del_flow_rules(handler->rule);
2148                         ft_prio->refcount--;
2149                         kfree(handler);
2150                         handler = handler_dst;
2151                 } else {
2152                         list_add(&handler_dst->list, &handler->list);
2153                 }
2154         }
2155
2156         return handler;
2157 }
2158 enum {
2159         LEFTOVERS_MC,
2160         LEFTOVERS_UC,
2161 };
2162
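/* Leftovers rules catch traffic not claimed by any other rule: a multicast
 * catch-all (dst_mac multicast bit masked and set) is always installed, and
 * for IB_FLOW_ATTR_ALL_DEFAULT a unicast catch-all (same mask, bit clear) is
 * chained onto it.
 */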
2163 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
2164                                                           struct mlx5_ib_flow_prio *ft_prio,
2165                                                           struct ib_flow_attr *flow_attr,
2166                                                           struct mlx5_flow_destination *dst)
2167 {
2168         struct mlx5_ib_flow_handler *handler_ucast = NULL;
2169         struct mlx5_ib_flow_handler *handler = NULL;
2170
2171         static struct {
2172                 struct ib_flow_attr     flow_attr;
2173                 struct ib_flow_spec_eth eth_flow;
2174         } leftovers_specs[] = {
2175                 [LEFTOVERS_MC] = {
2176                         .flow_attr = {
2177                                 .num_of_specs = 1,
2178                                 .size = sizeof(leftovers_specs[0])
2179                         },
2180                         .eth_flow = {
2181                                 .type = IB_FLOW_SPEC_ETH,
2182                                 .size = sizeof(struct ib_flow_spec_eth),
2183                                 .mask = {.dst_mac = {0x1} },
2184                                 .val =  {.dst_mac = {0x1} }
2185                         }
2186                 },
2187                 [LEFTOVERS_UC] = {
2188                         .flow_attr = {
2189                                 .num_of_specs = 1,
2190                                 .size = sizeof(leftovers_specs[0])
2191                         },
2192                         .eth_flow = {
2193                                 .type = IB_FLOW_SPEC_ETH,
2194                                 .size = sizeof(struct ib_flow_spec_eth),
2195                                 .mask = {.dst_mac = {0x1} },
2196                                 .val = {.dst_mac = {} }
2197                         }
2198                 }
2199         };
2200
2201         handler = create_flow_rule(dev, ft_prio,
2202                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
2203                                    dst);
2204         if (!IS_ERR(handler) &&
2205             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2206                 handler_ucast = create_flow_rule(dev, ft_prio,
2207                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
2208                                                  dst);
2209                 if (IS_ERR(handler_ucast)) {
2210                         mlx5_del_flow_rules(handler->rule);
2211                         ft_prio->refcount--;
2212                         kfree(handler);
2213                         handler = handler_ucast;
2214                 } else {
2215                         list_add(&handler_ucast->list, &handler->list);
2216                 }
2217         }
2218
2219         return handler;
2220 }
2221
2222 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2223                                                         struct mlx5_ib_flow_prio *ft_rx,
2224                                                         struct mlx5_ib_flow_prio *ft_tx,
2225                                                         struct mlx5_flow_destination *dst)
2226 {
2227         struct mlx5_ib_flow_handler *handler_rx;
2228         struct mlx5_ib_flow_handler *handler_tx;
2229         int err;
2230         static const struct ib_flow_attr flow_attr  = {
2231                 .num_of_specs = 0,
2232                 .size = sizeof(flow_attr)
2233         };
2234
2235         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2236         if (IS_ERR(handler_rx)) {
2237                 err = PTR_ERR(handler_rx);
2238                 goto err;
2239         }
2240
2241         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2242         if (IS_ERR(handler_tx)) {
2243                 err = PTR_ERR(handler_tx);
2244                 goto err_tx;
2245         }
2246
2247         list_add(&handler_tx->list, &handler_rx->list);
2248
2249         return handler_rx;
2250
2251 err_tx:
2252         mlx5_del_flow_rules(handler_rx->rule);
2253         ft_rx->refcount--;
2254         kfree(handler_rx);
2255 err:
2256         return ERR_PTR(err);
2257 }
2258
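/* ib_create_flow entry point: validate the domain, port and flags, take the
 * flow_db lock, resolve the RX flow table (plus a TX table for sniffer
 * rules), point the destination at the QP's TIR (RSS or raw-packet RQ), and
 * dispatch to the appropriate rule constructor above.
 */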
2259 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2260                                            struct ib_flow_attr *flow_attr,
2261                                            int domain)
2262 {
2263         struct mlx5_ib_dev *dev = to_mdev(qp->device);
2264         struct mlx5_ib_qp *mqp = to_mqp(qp);
2265         struct mlx5_ib_flow_handler *handler = NULL;
2266         struct mlx5_flow_destination *dst = NULL;
2267         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2268         struct mlx5_ib_flow_prio *ft_prio;
2269         int err;
2270
2271         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
2272                 return ERR_PTR(-ENOSPC);
2273
2274         if (domain != IB_FLOW_DOMAIN_USER ||
2275             flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
2276             (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
2277                 return ERR_PTR(-EINVAL);
2278
2279         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2280         if (!dst)
2281                 return ERR_PTR(-ENOMEM);
2282
2283         mutex_lock(&dev->flow_db.lock);
2284
2285         ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2286         if (IS_ERR(ft_prio)) {
2287                 err = PTR_ERR(ft_prio);
2288                 goto unlock;
2289         }
2290         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2291                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2292                 if (IS_ERR(ft_prio_tx)) {
2293                         err = PTR_ERR(ft_prio_tx);
2294                         ft_prio_tx = NULL;
2295                         goto destroy_ft;
2296                 }
2297         }
2298
2299         dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2300         if (mqp->flags & MLX5_IB_QP_RSS)
2301                 dst->tir_num = mqp->rss_qp.tirn;
2302         else
2303                 dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2304
2305         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2306                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
2307                         handler = create_dont_trap_rule(dev, ft_prio,
2308                                                         flow_attr, dst);
2309                 } else {
2310                         handler = create_flow_rule(dev, ft_prio, flow_attr,
2311                                                    dst);
2312                 }
2313         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2314                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2315                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
2316                                                 dst);
2317         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2318                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2319         } else {
2320                 err = -EINVAL;
2321                 goto destroy_ft;
2322         }
2323
2324         if (IS_ERR(handler)) {
2325                 err = PTR_ERR(handler);
2326                 handler = NULL;
2327                 goto destroy_ft;
2328         }
2329
2330         mutex_unlock(&dev->flow_db.lock);
2331         kfree(dst);
2332
2333         return &handler->ibflow;
2334
2335 destroy_ft:
2336         put_flow_table(dev, ft_prio, false);
2337         if (ft_prio_tx)
2338                 put_flow_table(dev, ft_prio_tx, false);
2339 unlock:
2340         mutex_unlock(&dev->flow_db.lock);
2341         kfree(dst);
2342         kfree(handler);
2343         return ERR_PTR(err);
2344 }
2345
2346 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2347 {
2348         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2349         int err;
2350
2351         err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2352         if (err)
2353                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2354                              ibqp->qp_num, gid->raw);
2355
2356         return err;
2357 }
2358
2359 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2360 {
2361         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2362         int err;
2363
2364         err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2365         if (err)
2366                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2367                              ibqp->qp_num, gid->raw);
2368
2369         return err;
2370 }
2371
2372 static int init_node_data(struct mlx5_ib_dev *dev)
2373 {
2374         int err;
2375
2376         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2377         if (err)
2378                 return err;
2379
2380         dev->mdev->rev_id = dev->mdev->pdev->revision;
2381
2382         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2383 }
2384
2385 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2386                              char *buf)
2387 {
2388         struct mlx5_ib_dev *dev =
2389                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2390
2391         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
2392 }
2393
2394 static ssize_t show_reg_pages(struct device *device,
2395                               struct device_attribute *attr, char *buf)
2396 {
2397         struct mlx5_ib_dev *dev =
2398                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2399
2400         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2401 }
2402
2403 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2404                         char *buf)
2405 {
2406         struct mlx5_ib_dev *dev =
2407                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2408         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2409 }
2410
2411 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2412                         char *buf)
2413 {
2414         struct mlx5_ib_dev *dev =
2415                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2416         return sprintf(buf, "%x\n", dev->mdev->rev_id);
2417 }
2418
2419 static ssize_t show_board(struct device *device, struct device_attribute *attr,
2420                           char *buf)
2421 {
2422         struct mlx5_ib_dev *dev =
2423                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2424         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2425                        dev->mdev->board_id);
2426 }
2427
2428 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2429 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2430 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2431 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2432 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2433
2434 static struct device_attribute *mlx5_class_attributes[] = {
2435         &dev_attr_hw_rev,
2436         &dev_attr_hca_type,
2437         &dev_attr_board_id,
2438         &dev_attr_fw_pages,
2439         &dev_attr_reg_pages,
2440 };
2441
2442 static void pkey_change_handler(struct work_struct *work)
2443 {
2444         struct mlx5_ib_port_resources *ports =
2445                 container_of(work, struct mlx5_ib_port_resources,
2446                              pkey_change_work);
2447
2448         mutex_lock(&ports->devr->mutex);
2449         mlx5_ib_gsi_pkey_change(ports->gsi);
2450         mutex_unlock(&ports->devr->mutex);
2451 }
2452
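/* On a fatal device error, walk every QP on this ibdev (under the reset-flow
 * lock) and, for each QP with outstanding send or receive work, queue its
 * CQs once on cq_armed_list; their completion handlers are then invoked so
 * consumers are prodded to poll and observe the error completions.
 */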
2453 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2454 {
2455         struct mlx5_ib_qp *mqp;
2456         struct mlx5_ib_cq *send_mcq, *recv_mcq;
2457         struct mlx5_core_cq *mcq;
2458         struct list_head cq_armed_list;
2459         unsigned long flags_qp;
2460         unsigned long flags_cq;
2461         unsigned long flags;
2462
2463         INIT_LIST_HEAD(&cq_armed_list);
2464
2465         /* Go over the QP list residing on this ibdev, synced with QP create/destroy. */
2466         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2467         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2468                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2469                 if (mqp->sq.tail != mqp->sq.head) {
2470                         send_mcq = to_mcq(mqp->ibqp.send_cq);
2471                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
2472                         if (send_mcq->mcq.comp &&
2473                             mqp->ibqp.send_cq->comp_handler) {
2474                                 if (!send_mcq->mcq.reset_notify_added) {
2475                                         send_mcq->mcq.reset_notify_added = 1;
2476                                         list_add_tail(&send_mcq->mcq.reset_notify,
2477                                                       &cq_armed_list);
2478                                 }
2479                         }
2480                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2481                 }
2482                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2483                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2484                 /* no handling is needed for SRQ */
2485                 if (!mqp->ibqp.srq) {
2486                         if (mqp->rq.tail != mqp->rq.head) {
2487                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2488                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2489                                 if (recv_mcq->mcq.comp &&
2490                                     mqp->ibqp.recv_cq->comp_handler) {
2491                                         if (!recv_mcq->mcq.reset_notify_added) {
2492                                                 recv_mcq->mcq.reset_notify_added = 1;
2493                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
2494                                                               &cq_armed_list);
2495                                         }
2496                                 }
2497                                 spin_unlock_irqrestore(&recv_mcq->lock,
2498                                                        flags_cq);
2499                         }
2500                 }
2501                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2502         }
2503         /* At this point all in-flight post-send work is guaranteed to be visible,
2504          * since we took and released the locks above. Now arm all involved CQs.
2505          */
2506         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2507                 mcq->comp(mcq);
2508         }
2509         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2510 }
2511
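/* Translate mlx5 core events into IB events and dispatch them to registered
 * clients. Port up/down on Ethernet (RoCE) ports is skipped here since it is
 * handled by the netdev notifier, and a PKEY change additionally schedules
 * the per-port pkey_change_work to update the GSI QP.
 */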
2512 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2513                           enum mlx5_dev_event event, unsigned long param)
2514 {
2515         struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2516         struct ib_event ibev;
2517         bool fatal = false;
2518         u8 port = 0;
2519
2520         switch (event) {
2521         case MLX5_DEV_EVENT_SYS_ERROR:
2522                 ibev.event = IB_EVENT_DEVICE_FATAL;
2523                 mlx5_ib_handle_internal_error(ibdev);
2524                 fatal = true;
2525                 break;
2526
2527         case MLX5_DEV_EVENT_PORT_UP:
2528         case MLX5_DEV_EVENT_PORT_DOWN:
2529         case MLX5_DEV_EVENT_PORT_INITIALIZED:
2530                 port = (u8)param;
2531
2532                 /* In RoCE, port up/down events are handled in
2533                  * mlx5_netdev_event().
2534                  */
2535                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2536                         IB_LINK_LAYER_ETHERNET)
2537                         return;
2538
2539                 ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2540                              IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2541                 break;
2542
2543         case MLX5_DEV_EVENT_LID_CHANGE:
2544                 ibev.event = IB_EVENT_LID_CHANGE;
2545                 port = (u8)param;
2546                 break;
2547
2548         case MLX5_DEV_EVENT_PKEY_CHANGE:
2549                 ibev.event = IB_EVENT_PKEY_CHANGE;
2550                 port = (u8)param;
2551
2552                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2553                 break;
2554
2555         case MLX5_DEV_EVENT_GUID_CHANGE:
2556                 ibev.event = IB_EVENT_GID_CHANGE;
2557                 port = (u8)param;
2558                 break;
2559
2560         case MLX5_DEV_EVENT_CLIENT_REREG:
2561                 ibev.event = IB_EVENT_CLIENT_REREGISTER;
2562                 port = (u8)param;
2563                 break;
2564         default:
2565                 return;
2566         }
2567
2568         ibev.device           = &ibdev->ib_dev;
2569         ibev.element.port_num = port;
2570
2571         if (port < 1 || port > ibdev->num_ports) {
2572                 mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
2573                 return;
2574         }
2575
2576         if (ibdev->ib_active)
2577                 ib_dispatch_event(&ibev);
2578
2579         if (fatal)
2580                 ibdev->ib_active = false;
2581 }
2582
2583 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
2584 {
2585         struct mlx5_hca_vport_context vport_ctx;
2586         int err;
2587         int port;
2588
2589         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2590                 dev->mdev->port_caps[port - 1].has_smi = false;
2591                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2592                     MLX5_CAP_PORT_TYPE_IB) {
2593                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
2594                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
2595                                                                    port, 0,
2596                                                                    &vport_ctx);
2597                                 if (err) {
2598                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
2599                                                     port, err);
2600                                         return err;
2601                                 }
2602                                 dev->mdev->port_caps[port - 1].has_smi =
2603                                         vport_ctx.has_smi;
2604                         } else {
2605                                 dev->mdev->port_caps[port - 1].has_smi = true;
2606                         }
2607                 }
2608         }
2609         return 0;
2610 }
2611
2612 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2613 {
2614         int port;
2615
2616         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2617                 mlx5_query_ext_port_caps(dev, port);
2618 }
2619
2620 static int get_port_caps(struct mlx5_ib_dev *dev)
2621 {
2622         struct ib_device_attr *dprops = NULL;
2623         struct ib_port_attr *pprops = NULL;
2624         int err = -ENOMEM;
2625         int port;
2626         struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2627
2628         pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2629         if (!pprops)
2630                 goto out;
2631
2632         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2633         if (!dprops)
2634                 goto out;
2635
2636         err = set_has_smi_cap(dev);
2637         if (err)
2638                 goto out;
2639
2640         err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2641         if (err) {
2642                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
2643                 goto out;
2644         }
2645
2646         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2647                 err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2648                 if (err) {
2649                         mlx5_ib_warn(dev, "query_port %d failed %d\n",
2650                                      port, err);
2651                         break;
2652                 }
2653                 dev->mdev->port_caps[port - 1].pkey_table_len =
2654                                                 dprops->max_pkeys;
2655                 dev->mdev->port_caps[port - 1].gid_table_len =
2656                                                 pprops->gid_tbl_len;
2657                 mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2658                             dprops->max_pkeys, pprops->gid_tbl_len);
2659         }
2660
2661 out:
2662         kfree(pprops);
2663         kfree(dprops);
2664
2665         return err;
2666 }
2667
2668 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2669 {
2670         int err;
2671
2672         err = mlx5_mr_cache_cleanup(dev);
2673         if (err)
2674                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2675
2676         mlx5_ib_destroy_qp(dev->umrc.qp);
2677         ib_free_cq(dev->umrc.cq);
2678         ib_dealloc_pd(dev->umrc.pd);
2679 }
2680
2681 enum {
2682         MAX_UMR_WR = 128,
2683 };
2684
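/* Create the kernel-internal resources used to post UMR (memory registration)
 * work requests: a PD, a softirq-context CQ and an MLX5_IB_QPT_REG_UMR QP
 * that is driven through INIT/RTR/RTS by hand. The semaphore initialised to
 * MAX_UMR_WR bounds the number of outstanding UMR WRs, and the MR cache is
 * initialised last.
 */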
2685 static int create_umr_res(struct mlx5_ib_dev *dev)
2686 {
2687         struct ib_qp_init_attr *init_attr = NULL;
2688         struct ib_qp_attr *attr = NULL;
2689         struct ib_pd *pd;
2690         struct ib_cq *cq;
2691         struct ib_qp *qp;
2692         int ret;
2693
2694         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2695         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2696         if (!attr || !init_attr) {
2697                 ret = -ENOMEM;
2698                 goto error_0;
2699         }
2700
2701         pd = ib_alloc_pd(&dev->ib_dev, 0);
2702         if (IS_ERR(pd)) {
2703                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2704                 ret = PTR_ERR(pd);
2705                 goto error_0;
2706         }
2707
2708         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2709         if (IS_ERR(cq)) {
2710                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2711                 ret = PTR_ERR(cq);
2712                 goto error_2;
2713         }
2714
2715         init_attr->send_cq = cq;
2716         init_attr->recv_cq = cq;
2717         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2718         init_attr->cap.max_send_wr = MAX_UMR_WR;
2719         init_attr->cap.max_send_sge = 1;
2720         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2721         init_attr->port_num = 1;
2722         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2723         if (IS_ERR(qp)) {
2724                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2725                 ret = PTR_ERR(qp);
2726                 goto error_3;
2727         }
2728         qp->device     = &dev->ib_dev;
2729         qp->real_qp    = qp;
2730         qp->uobject    = NULL;
2731         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2732
2733         attr->qp_state = IB_QPS_INIT;
2734         attr->port_num = 1;
2735         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2736                                 IB_QP_PORT, NULL);
2737         if (ret) {
2738                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2739                 goto error_4;
2740         }
2741
2742         memset(attr, 0, sizeof(*attr));
2743         attr->qp_state = IB_QPS_RTR;
2744         attr->path_mtu = IB_MTU_256;
2745
2746         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2747         if (ret) {
2748                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2749                 goto error_4;
2750         }
2751
2752         memset(attr, 0, sizeof(*attr));
2753         attr->qp_state = IB_QPS_RTS;
2754         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2755         if (ret) {
2756                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2757                 goto error_4;
2758         }
2759
2760         dev->umrc.qp = qp;
2761         dev->umrc.cq = cq;
2762         dev->umrc.pd = pd;
2763
2764         sema_init(&dev->umrc.sem, MAX_UMR_WR);
2765         ret = mlx5_mr_cache_init(dev);
2766         if (ret) {
2767                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2768                 goto error_4;
2769         }
2770
2771         kfree(attr);
2772         kfree(init_attr);
2773
2774         return 0;
2775
2776 error_4:
2777         mlx5_ib_destroy_qp(qp);
2778
2779 error_3:
2780         ib_free_cq(cq);
2781
2782 error_2:
2783         ib_dealloc_pd(pd);
2784
2785 error_0:
2786         kfree(attr);
2787         kfree(init_attr);
2788         return ret;
2789 }
2790
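/*
 * Allocate the device-wide verbs objects used by kernel consumers: PD
 * p0, CQ c0, XRC domains x0/x1 and SRQs s0 (XRC, on x0/c0) and s1
 * (basic), plus the per-port P_Key change work items.
 */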
2791 static int create_dev_resources(struct mlx5_ib_resources *devr)
2792 {
2793         struct ib_srq_init_attr attr;
2794         struct mlx5_ib_dev *dev;
2795         struct ib_cq_init_attr cq_attr = {.cqe = 1};
2796         int port;
2797         int ret = 0;
2798
2799         dev = container_of(devr, struct mlx5_ib_dev, devr);
2800
2801         mutex_init(&devr->mutex);
2802
2803         devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2804         if (IS_ERR(devr->p0)) {
2805                 ret = PTR_ERR(devr->p0);
2806                 goto error0;
2807         }
2808         devr->p0->device  = &dev->ib_dev;
2809         devr->p0->uobject = NULL;
2810         atomic_set(&devr->p0->usecnt, 0);
2811
2812         devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2813         if (IS_ERR(devr->c0)) {
2814                 ret = PTR_ERR(devr->c0);
2815                 goto error1;
2816         }
2817         devr->c0->device        = &dev->ib_dev;
2818         devr->c0->uobject       = NULL;
2819         devr->c0->comp_handler  = NULL;
2820         devr->c0->event_handler = NULL;
2821         devr->c0->cq_context    = NULL;
2822         atomic_set(&devr->c0->usecnt, 0);
2823
2824         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2825         if (IS_ERR(devr->x0)) {
2826                 ret = PTR_ERR(devr->x0);
2827                 goto error2;
2828         }
2829         devr->x0->device = &dev->ib_dev;
2830         devr->x0->inode = NULL;
2831         atomic_set(&devr->x0->usecnt, 0);
2832         mutex_init(&devr->x0->tgt_qp_mutex);
2833         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2834
2835         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2836         if (IS_ERR(devr->x1)) {
2837                 ret = PTR_ERR(devr->x1);
2838                 goto error3;
2839         }
2840         devr->x1->device = &dev->ib_dev;
2841         devr->x1->inode = NULL;
2842         atomic_set(&devr->x1->usecnt, 0);
2843         mutex_init(&devr->x1->tgt_qp_mutex);
2844         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2845
2846         memset(&attr, 0, sizeof(attr));
2847         attr.attr.max_sge = 1;
2848         attr.attr.max_wr = 1;
2849         attr.srq_type = IB_SRQT_XRC;
2850         attr.ext.xrc.cq = devr->c0;
2851         attr.ext.xrc.xrcd = devr->x0;
2852
2853         devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2854         if (IS_ERR(devr->s0)) {
2855                 ret = PTR_ERR(devr->s0);
2856                 goto error4;
2857         }
2858         devr->s0->device        = &dev->ib_dev;
2859         devr->s0->pd            = devr->p0;
2860         devr->s0->uobject       = NULL;
2861         devr->s0->event_handler = NULL;
2862         devr->s0->srq_context   = NULL;
2863         devr->s0->srq_type      = IB_SRQT_XRC;
2864         devr->s0->ext.xrc.xrcd  = devr->x0;
2865         devr->s0->ext.xrc.cq    = devr->c0;
2866         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2867         atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2868         atomic_inc(&devr->p0->usecnt);
2869         atomic_set(&devr->s0->usecnt, 0);
2870
2871         memset(&attr, 0, sizeof(attr));
2872         attr.attr.max_sge = 1;
2873         attr.attr.max_wr = 1;
2874         attr.srq_type = IB_SRQT_BASIC;
2875         devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2876         if (IS_ERR(devr->s1)) {
2877                 ret = PTR_ERR(devr->s1);
2878                 goto error5;
2879         }
2880         devr->s1->device        = &dev->ib_dev;
2881         devr->s1->pd            = devr->p0;
2882         devr->s1->uobject       = NULL;
2883         devr->s1->event_handler = NULL;
2884         devr->s1->srq_context   = NULL;
2885         devr->s1->srq_type      = IB_SRQT_BASIC;
2886         devr->s1->ext.xrc.cq    = devr->c0;
2887         atomic_inc(&devr->p0->usecnt);
2888         atomic_set(&devr->s1->usecnt, 0);
2889
2890         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2891                 INIT_WORK(&devr->ports[port].pkey_change_work,
2892                           pkey_change_handler);
2893                 devr->ports[port].devr = devr;
2894         }
2895
2896         return 0;
2897
2898 error5:
2899         mlx5_ib_destroy_srq(devr->s0);
2900 error4:
2901         mlx5_ib_dealloc_xrcd(devr->x1);
2902 error3:
2903         mlx5_ib_dealloc_xrcd(devr->x0);
2904 error2:
2905         mlx5_ib_destroy_cq(devr->c0);
2906 error1:
2907         mlx5_ib_dealloc_pd(devr->p0);
2908 error0:
2909         return ret;
2910 }
2911
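/* Release everything created by create_dev_resources(). */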
2912 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2913 {
2914         struct mlx5_ib_dev *dev =
2915                 container_of(devr, struct mlx5_ib_dev, devr);
2916         int port;
2917
2918         mlx5_ib_destroy_srq(devr->s1);
2919         mlx5_ib_destroy_srq(devr->s0);
2920         mlx5_ib_dealloc_xrcd(devr->x0);
2921         mlx5_ib_dealloc_xrcd(devr->x1);
2922         mlx5_ib_destroy_cq(devr->c0);
2923         mlx5_ib_dealloc_pd(devr->p0);
2924
2925         /* Make sure no change P_Key work items are still executing */
2926         for (port = 0; port < dev->num_ports; ++port)
2927                 cancel_work_sync(&devr->ports[port].pkey_change_work);
2928 }
2929
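/*
 * Compute the rdma core port capability flags for port 1: plain IB when
 * the link layer is InfiniBand; for Ethernet, RoCE v1/v2 flags are set
 * from roce_version, but only if both IPv4 and IPv6 L3 types are
 * supported.
 */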
2930 static u32 get_core_cap_flags(struct ib_device *ibdev)
2931 {
2932         struct mlx5_ib_dev *dev = to_mdev(ibdev);
2933         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2934         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2935         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2936         u32 ret = 0;
2937
2938         if (ll == IB_LINK_LAYER_INFINIBAND)
2939                 return RDMA_CORE_PORT_IBA_IB;
2940
2941         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2942                 return 0;
2943
2944         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2945                 return 0;
2946
2947         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2948                 ret |= RDMA_CORE_PORT_IBA_ROCE;
2949
2950         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2951                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2952
2953         return ret;
2954 }
2955
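/*
 * Fill the immutable port attributes: P_Key/GID table sizes from
 * query_port, the core capability flags, and the MAD size when the link
 * layer is IB or the device supports RoCE.
 */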
2956 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2957                                struct ib_port_immutable *immutable)
2958 {
2959         struct ib_port_attr attr;
2960         struct mlx5_ib_dev *dev = to_mdev(ibdev);
2961         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
2962         int err;
2963
2964         err = mlx5_ib_query_port(ibdev, port_num, &attr);
2965         if (err)
2966                 return err;
2967
2968         immutable->pkey_tbl_len = attr.pkey_tbl_len;
2969         immutable->gid_tbl_len = attr.gid_tbl_len;
2970         immutable->core_cap_flags = get_core_cap_flags(ibdev);
2971         if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
2972                 immutable->max_mad_size = IB_MGMT_MAD_SIZE;
2973
2974         return 0;
2975 }
2976
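/* Format the firmware version string as <major>.<minor>.<subminor>. */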
2977 static void get_dev_fw_str(struct ib_device *ibdev, char *str,
2978                            size_t str_len)
2979 {
2980         struct mlx5_ib_dev *dev =
2981                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
2982         snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
2983                        fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
2984 }
2985
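/*
 * If the LAG flow namespace exists and LAG is active, create the vport
 * LAG and its demux flow table; mlx5_eth_lag_cleanup() undoes both.
 */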
2986 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
2987 {
2988         struct mlx5_core_dev *mdev = dev->mdev;
2989         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
2990                                                                  MLX5_FLOW_NAMESPACE_LAG);
2991         struct mlx5_flow_table *ft;
2992         int err;
2993
2994         if (!ns || !mlx5_lag_is_active(mdev))
2995                 return 0;
2996
2997         err = mlx5_cmd_create_vport_lag(mdev);
2998         if (err)
2999                 return err;
3000
3001         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
3002         if (IS_ERR(ft)) {
3003                 err = PTR_ERR(ft);
3004                 goto err_destroy_vport_lag;
3005         }
3006
3007         dev->flow_db.lag_demux_ft = ft;
3008         return 0;
3009
3010 err_destroy_vport_lag:
3011         mlx5_cmd_destroy_vport_lag(mdev);
3012         return err;
3013 }
3014
3015 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
3016 {
3017         struct mlx5_core_dev *mdev = dev->mdev;
3018
3019         if (dev->flow_db.lag_demux_ft) {
3020                 mlx5_destroy_flow_table(dev->flow_db.lag_demux_ft);
3021                 dev->flow_db.lag_demux_ft = NULL;
3022
3023                 mlx5_cmd_destroy_vport_lag(mdev);
3024         }
3025 }
3026
3027 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev)
3028 {
3029         int err;
3030
3031         dev->roce.nb.notifier_call = mlx5_netdev_event;
3032         err = register_netdevice_notifier(&dev->roce.nb);
3033         if (err) {
3034                 dev->roce.nb.notifier_call = NULL;
3035                 return err;
3036         }
3037
3038         return 0;
3039 }
3040
3041 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev)
3042 {
3043         if (dev->roce.nb.notifier_call) {
3044                 unregister_netdevice_notifier(&dev->roce.nb);
3045                 dev->roce.nb.notifier_call = NULL;
3046         }
3047 }
3048
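/*
 * RoCE bring-up: register the netdevice notifier, enable RoCE on the
 * NIC vport when the HCA reports the roce capability, and initialize
 * Ethernet LAG support.
 */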
3049 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
3050 {
3051         int err;
3052
3053         err = mlx5_add_netdev_notifier(dev);
3054         if (err)
3055                 return err;
3056
3057         if (MLX5_CAP_GEN(dev->mdev, roce)) {
3058                 err = mlx5_nic_vport_enable_roce(dev->mdev);
3059                 if (err)
3060                         goto err_unregister_netdevice_notifier;
3061         }
3062
3063         err = mlx5_eth_lag_init(dev);
3064         if (err)
3065                 goto err_disable_roce;
3066
3067         return 0;
3068
3069 err_disable_roce:
3070         if (MLX5_CAP_GEN(dev->mdev, roce))
3071                 mlx5_nic_vport_disable_roce(dev->mdev);
3072
3073 err_unregister_netdevice_notifier:
3074         mlx5_remove_netdev_notifier(dev);
3075         return err;
3076 }
3077
3078 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
3079 {
3080         mlx5_eth_lag_cleanup(dev);
3081         if (MLX5_CAP_GEN(dev->mdev, roce))
3082                 mlx5_nic_vport_disable_roce(dev->mdev);
3083 }
3084
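/*
 * Q counter descriptors: each entry maps a counter name to its byte
 * offset in the QUERY_Q_COUNTER output.  Which sets are exposed depends
 * on the out_of_seq_cnt and retransmission_q_counters capabilities.
 */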
3085 struct mlx5_ib_q_counter {
3086         const char *name;
3087         size_t offset;
3088 };
3089
3090 #define INIT_Q_COUNTER(_name)           \
3091         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
3092
3093 static const struct mlx5_ib_q_counter basic_q_cnts[] = {
3094         INIT_Q_COUNTER(rx_write_requests),
3095         INIT_Q_COUNTER(rx_read_requests),
3096         INIT_Q_COUNTER(rx_atomic_requests),
3097         INIT_Q_COUNTER(out_of_buffer),
3098 };
3099
3100 static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = {
3101         INIT_Q_COUNTER(out_of_sequence),
3102 };
3103
3104 static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
3105         INIT_Q_COUNTER(duplicate_request),
3106         INIT_Q_COUNTER(rnr_nak_retry_err),
3107         INIT_Q_COUNTER(packet_seq_err),
3108         INIT_Q_COUNTER(implied_nak_seq_err),
3109         INIT_Q_COUNTER(local_ack_timeout_err),
3110 };
3111
3112 static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
3113 {
3114         unsigned int i;
3115
3116         for (i = 0; i < dev->num_ports; i++) {
3117                 mlx5_core_dealloc_q_counter(dev->mdev,
3118                                             dev->port[i].q_cnts.set_id);
3119                 kfree(dev->port[i].q_cnts.names);
3120                 kfree(dev->port[i].q_cnts.offsets);
3121         }
3122 }
3123
3124 static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
3125                                       const char ***names,
3126                                       size_t **offsets,
3127                                       u32 *num)
3128 {
3129         u32 num_counters;
3130
3131         num_counters = ARRAY_SIZE(basic_q_cnts);
3132
3133         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
3134                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
3135
3136         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
3137                 num_counters += ARRAY_SIZE(retrans_q_cnts);
3138
3139         *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL);
3140         if (!*names)
3141                 return -ENOMEM;
3142
3143         *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL);
3144         if (!*offsets)
3145                 goto err_names;
3146
3147         *num = num_counters;
3148
3149         return 0;
3150
3151 err_names:
3152         kfree(*names);
3153         return -ENOMEM;
3154 }
3155
3156 static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
3157                                     const char **names,
3158                                     size_t *offsets)
3159 {
3160         int i;
3161         int j = 0;
3162
3163         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
3164                 names[j] = basic_q_cnts[i].name;
3165                 offsets[j] = basic_q_cnts[i].offset;
3166         }
3167
3168         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
3169                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
3170                         names[j] = out_of_seq_q_cnts[i].name;
3171                         offsets[j] = out_of_seq_q_cnts[i].offset;
3172                 }
3173         }
3174
3175         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3176                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
3177                         names[j] = retrans_q_cnts[i].name;
3178                         offsets[j] = retrans_q_cnts[i].offset;
3179                 }
3180         }
3181 }
3182
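/*
 * Allocate one hardware Q counter set per port together with the
 * name/offset arrays that back the rdma_hw_stats interface.
 */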
3183 static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
3184 {
3185         int i;
3186         int ret;
3187
3188         for (i = 0; i < dev->num_ports; i++) {
3189                 struct mlx5_ib_port *port = &dev->port[i];
3190
3191                 ret = mlx5_core_alloc_q_counter(dev->mdev,
3192                                                 &port->q_cnts.set_id);
3193                 if (ret) {
3194                         mlx5_ib_warn(dev,
3195                                      "couldn't allocate queue counter for port %d, err %d\n",
3196                                      i + 1, ret);
3197                         goto dealloc_counters;
3198                 }
3199
3200                 ret = __mlx5_ib_alloc_q_counters(dev,
3201                                                  &port->q_cnts.names,
3202                                                  &port->q_cnts.offsets,
3203                                                  &port->q_cnts.num_counters);
                if (ret) {
                        /*
                         * Free the counter set just allocated for this
                         * port; the unwind loop below only covers the
                         * earlier ports.
                         */
                        mlx5_core_dealloc_q_counter(dev->mdev,
                                                    port->q_cnts.set_id);
                        goto dealloc_counters;
                }
3206
3207                 mlx5_ib_fill_q_counters(dev, port->q_cnts.names,
3208                                         port->q_cnts.offsets);
3209         }
3210
3211         return 0;
3212
3213 dealloc_counters:
3214         while (--i >= 0)
3215                 mlx5_core_dealloc_q_counter(dev->mdev,
3216                                             dev->port[i].q_cnts.set_id);
3217
3218         return ret;
3219 }
3220
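/* Only per-port stats are supported; port_num 0 (device-wide) is rejected. */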
3221 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
3222                                                     u8 port_num)
3223 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_port *port;

        /* We support only per port stats */
        if (port_num == 0)
                return NULL;

        port = &dev->port[port_num - 1];

        return rdma_alloc_hw_stats_struct(port->q_cnts.names,
                                          port->q_cnts.num_counters,
                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
3234 }
3235
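/*
 * Query the port's Q counter set and copy each 32-bit counter into the
 * rdma_hw_stats values array.
 */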
3236 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
3237                                 struct rdma_hw_stats *stats,
3238                                 u8 port_num, int index)
3239 {
3240         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3241         struct mlx5_ib_port *port = &dev->port[port_num - 1];
3242         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
3243         void *out;
3244         __be32 val;
3245         int ret;
3246         int i;
3247
3248         if (!stats)
3249                 return -ENOSYS;
3250
3251         out = mlx5_vzalloc(outlen);
3252         if (!out)
3253                 return -ENOMEM;
3254
3255         ret = mlx5_core_query_q_counter(dev->mdev,
3256                                         port->q_cnts.set_id, 0,
3257                                         out, outlen);
        if (ret)
                goto free;

        for (i = 0; i < port->q_cnts.num_counters; i++) {
                val = *(__be32 *)(out + port->q_cnts.offsets[i]);
                stats->value[i] = (u64)be32_to_cpu(val);
        }
        ret = port->q_cnts.num_counters;

free:
        kvfree(out);
        /* Number of counters read on success, negative errno on failure */
        return ret;
3269 }
3270
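/*
 * Probe path: allocate the ib_device, discover port capabilities, wire
 * up the verbs callbacks (plus the capability-dependent MW, hw_stats,
 * XRC and Ethernet/flow-steering ops), then bring up RoCE for Ethernet
 * ports, the device resources, ODP, Q counters and UAR/bfregs, and
 * finally register with the IB core, create the UMR resources and the
 * sysfs attributes.
 */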
3271 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
3272 {
3273         struct mlx5_ib_dev *dev;
3274         enum rdma_link_layer ll;
3275         int port_type_cap;
3276         const char *name;
3277         int err;
3278         int i;
3279
3280         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
3281         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
3282
3283         printk_once(KERN_INFO "%s", mlx5_version);
3284
3285         dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
3286         if (!dev)
3287                 return NULL;
3288
3289         dev->mdev = mdev;
3290
3291         dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
3292                             GFP_KERNEL);
3293         if (!dev->port)
3294                 goto err_dealloc;
3295
3296         rwlock_init(&dev->roce.netdev_lock);
3297         err = get_port_caps(dev);
3298         if (err)
3299                 goto err_free_port;
3300
3301         if (mlx5_use_mad_ifc(dev))
3302                 get_ext_port_caps(dev);
3303
3304         if (!mlx5_lag_is_active(mdev))
3305                 name = "mlx5_%d";
3306         else
3307                 name = "mlx5_bond_%d";
3308
3309         strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
3310         dev->ib_dev.owner               = THIS_MODULE;
3311         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
3312         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
3313         dev->num_ports          = MLX5_CAP_GEN(mdev, num_ports);
3314         dev->ib_dev.phys_port_cnt     = dev->num_ports;
3315         dev->ib_dev.num_comp_vectors    =
3316                 dev->mdev->priv.eq_table.num_comp_vectors;
3317         dev->ib_dev.dma_device  = &mdev->pdev->dev;
3318
3319         dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
3320         dev->ib_dev.uverbs_cmd_mask     =
3321                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
3322                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
3323                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
3324                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
3325                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
3326                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
3327                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
3328                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
3329                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
3330                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
3331                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
3332                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
3333                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
3334                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
3335                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
3336                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
3337                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
3338                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
3339                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
3340                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
3341                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
3342                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
3343                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
3344                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
3345                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
3346                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
3347         dev->ib_dev.uverbs_ex_cmd_mask =
3348                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
3349                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
3350                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
3351                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP);
3352
3353         dev->ib_dev.query_device        = mlx5_ib_query_device;
3354         dev->ib_dev.query_port          = mlx5_ib_query_port;
3355         dev->ib_dev.get_link_layer      = mlx5_ib_port_link_layer;
3356         if (ll == IB_LINK_LAYER_ETHERNET)
3357                 dev->ib_dev.get_netdev  = mlx5_ib_get_netdev;
3358         dev->ib_dev.query_gid           = mlx5_ib_query_gid;
3359         dev->ib_dev.add_gid             = mlx5_ib_add_gid;
3360         dev->ib_dev.del_gid             = mlx5_ib_del_gid;
3361         dev->ib_dev.query_pkey          = mlx5_ib_query_pkey;
3362         dev->ib_dev.modify_device       = mlx5_ib_modify_device;
3363         dev->ib_dev.modify_port         = mlx5_ib_modify_port;
3364         dev->ib_dev.alloc_ucontext      = mlx5_ib_alloc_ucontext;
3365         dev->ib_dev.dealloc_ucontext    = mlx5_ib_dealloc_ucontext;
3366         dev->ib_dev.mmap                = mlx5_ib_mmap;
3367         dev->ib_dev.alloc_pd            = mlx5_ib_alloc_pd;
3368         dev->ib_dev.dealloc_pd          = mlx5_ib_dealloc_pd;
3369         dev->ib_dev.create_ah           = mlx5_ib_create_ah;
3370         dev->ib_dev.query_ah            = mlx5_ib_query_ah;
3371         dev->ib_dev.destroy_ah          = mlx5_ib_destroy_ah;
3372         dev->ib_dev.create_srq          = mlx5_ib_create_srq;
3373         dev->ib_dev.modify_srq          = mlx5_ib_modify_srq;
3374         dev->ib_dev.query_srq           = mlx5_ib_query_srq;
3375         dev->ib_dev.destroy_srq         = mlx5_ib_destroy_srq;
3376         dev->ib_dev.post_srq_recv       = mlx5_ib_post_srq_recv;
3377         dev->ib_dev.create_qp           = mlx5_ib_create_qp;
3378         dev->ib_dev.modify_qp           = mlx5_ib_modify_qp;
3379         dev->ib_dev.query_qp            = mlx5_ib_query_qp;
3380         dev->ib_dev.destroy_qp          = mlx5_ib_destroy_qp;
3381         dev->ib_dev.post_send           = mlx5_ib_post_send;
3382         dev->ib_dev.post_recv           = mlx5_ib_post_recv;
3383         dev->ib_dev.create_cq           = mlx5_ib_create_cq;
3384         dev->ib_dev.modify_cq           = mlx5_ib_modify_cq;
3385         dev->ib_dev.resize_cq           = mlx5_ib_resize_cq;
3386         dev->ib_dev.destroy_cq          = mlx5_ib_destroy_cq;
3387         dev->ib_dev.poll_cq             = mlx5_ib_poll_cq;
3388         dev->ib_dev.req_notify_cq       = mlx5_ib_arm_cq;
3389         dev->ib_dev.get_dma_mr          = mlx5_ib_get_dma_mr;
3390         dev->ib_dev.reg_user_mr         = mlx5_ib_reg_user_mr;
3391         dev->ib_dev.rereg_user_mr       = mlx5_ib_rereg_user_mr;
3392         dev->ib_dev.dereg_mr            = mlx5_ib_dereg_mr;
3393         dev->ib_dev.attach_mcast        = mlx5_ib_mcg_attach;
3394         dev->ib_dev.detach_mcast        = mlx5_ib_mcg_detach;
3395         dev->ib_dev.process_mad         = mlx5_ib_process_mad;
3396         dev->ib_dev.alloc_mr            = mlx5_ib_alloc_mr;
3397         dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
3398         dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
3399         dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3400         dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3401         if (mlx5_core_is_pf(mdev)) {
3402                 dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
3403                 dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
3404                 dev->ib_dev.get_vf_stats        = mlx5_ib_get_vf_stats;
3405                 dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
3406         }
3407
3408         dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
3409
3410         mlx5_ib_internal_fill_odp_caps(dev);
3411
3412         if (MLX5_CAP_GEN(mdev, imaicl)) {
3413                 dev->ib_dev.alloc_mw            = mlx5_ib_alloc_mw;
3414                 dev->ib_dev.dealloc_mw          = mlx5_ib_dealloc_mw;
3415                 dev->ib_dev.uverbs_cmd_mask |=
3416                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
3417                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3418         }
3419
3420         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3421                 dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
3422                 dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
3423         }
3424
3425         if (MLX5_CAP_GEN(mdev, xrc)) {
3426                 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3427                 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3428                 dev->ib_dev.uverbs_cmd_mask |=
3429                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3430                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3431         }
3432
3433         if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3434             IB_LINK_LAYER_ETHERNET) {
3435                 dev->ib_dev.create_flow = mlx5_ib_create_flow;
3436                 dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3437                 dev->ib_dev.create_wq    = mlx5_ib_create_wq;
3438                 dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
3439                 dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
3440                 dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3441                 dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3442                 dev->ib_dev.uverbs_ex_cmd_mask |=
3443                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3444                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3445                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3446                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3447                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3448                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3449                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3450         }
3451         err = init_node_data(dev);
3452         if (err)
3453                 goto err_free_port;
3454
3455         mutex_init(&dev->flow_db.lock);
3456         mutex_init(&dev->cap_mask_mutex);
3457         INIT_LIST_HEAD(&dev->qp_list);
3458         spin_lock_init(&dev->reset_flow_resource_lock);
3459
3460         if (ll == IB_LINK_LAYER_ETHERNET) {
3461                 err = mlx5_enable_eth(dev);
3462                 if (err)
3463                         goto err_free_port;
3464         }
3465
3466         err = create_dev_resources(&dev->devr);
3467         if (err)
3468                 goto err_disable_eth;
3469
3470         err = mlx5_ib_odp_init_one(dev);
3471         if (err)
3472                 goto err_rsrc;
3473
3474         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3475                 err = mlx5_ib_alloc_q_counters(dev);
3476                 if (err)
3477                         goto err_odp;
3478         }
3479
3480         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
3481         if (!dev->mdev->priv.uar)
3482                 goto err_q_cnt;
3483
3484         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
3485         if (err)
3486                 goto err_uar_page;
3487
3488         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
3489         if (err)
3490                 goto err_bfreg;
3491
3492         err = ib_register_device(&dev->ib_dev, NULL);
3493         if (err)
3494                 goto err_fp_bfreg;
3495
3496         err = create_umr_res(dev);
3497         if (err)
3498                 goto err_dev;
3499
3500         for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3501                 err = device_create_file(&dev->ib_dev.dev,
3502                                          mlx5_class_attributes[i]);
3503                 if (err)
3504                         goto err_umrc;
3505         }
3506
3507         dev->ib_active = true;
3508
3509         return dev;
3510
3511 err_umrc:
3512         destroy_umrc_res(dev);
3513
3514 err_dev:
3515         ib_unregister_device(&dev->ib_dev);
3516
3517 err_fp_bfreg:
3518         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3519
3520 err_bfreg:
3521         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3522
3523 err_uar_page:
3524         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
3525
3526 err_q_cnt:
3527         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3528                 mlx5_ib_dealloc_q_counters(dev);
3529
3530 err_odp:
3531         mlx5_ib_odp_remove_one(dev);
3532
3533 err_rsrc:
3534         destroy_dev_resources(&dev->devr);
3535
3536 err_disable_eth:
3537         if (ll == IB_LINK_LAYER_ETHERNET) {
3538                 mlx5_disable_eth(dev);
3539                 mlx5_remove_netdev_notifier(dev);
3540         }
3541
3542 err_free_port:
3543         kfree(dev->port);
3544
3545 err_dealloc:
3546         ib_dealloc_device((struct ib_device *)dev);
3547
3548         return NULL;
3549 }
3550
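/* Teardown: undo mlx5_ib_add() roughly in reverse order. */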
3551 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3552 {
3553         struct mlx5_ib_dev *dev = context;
3554         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3555
3556         mlx5_remove_netdev_notifier(dev);
3557         ib_unregister_device(&dev->ib_dev);
3558         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3559         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3560         mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
3561         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3562                 mlx5_ib_dealloc_q_counters(dev);
3563         destroy_umrc_res(dev);
3564         mlx5_ib_odp_remove_one(dev);
3565         destroy_dev_resources(&dev->devr);
3566         if (ll == IB_LINK_LAYER_ETHERNET)
3567                 mlx5_disable_eth(dev);
3568         kfree(dev->port);
3569         ib_dealloc_device(&dev->ib_dev);
3570 }
3571
3572 static struct mlx5_interface mlx5_ib_interface = {
3573         .add            = mlx5_ib_add,
3574         .remove         = mlx5_ib_remove,
3575         .event          = mlx5_ib_event,
3576 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
3577         .pfault         = mlx5_ib_pfault,
3578 #endif
3579         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
3580 };
3581
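/* Module init: set up ODP bookkeeping, then register with mlx5_core. */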
3582 static int __init mlx5_ib_init(void)
3583 {
3584         int err;
3585
3586         mlx5_ib_odp_init();
3587
3588         err = mlx5_register_interface(&mlx5_ib_interface);
3589
3590         return err;
3591 }
3592
3593 static void __exit mlx5_ib_cleanup(void)
3594 {
3595         mlx5_unregister_interface(&mlx5_ib_interface);
3596 }
3597
3598 module_init(mlx5_ib_init);
3599 module_exit(mlx5_ib_cleanup);