IB/mlx5: Enable IPoIB acceleration
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4dc0a8785fe0d250e1b8217c79d38915ae388ef7..d45772da09635c2164f4cef8bcf5255c17fe8cff 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -57,6 +57,7 @@
 #include <linux/mlx5/fs.h>
 #include <linux/mlx5/vport.h>
 #include "mlx5_ib.h"
+#include "cmd.h"
 
 #define DRIVER_NAME "mlx5_ib"
 #define DRIVER_VERSION "2.2-1"
@@ -165,15 +166,82 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
        return ndev;
 }
 
-static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
-                               struct ib_port_attr *props)
+static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+                                   u8 *active_width)
+{
+       switch (eth_proto_oper) {
+       case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
+       case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
+       case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
+       case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
+               *active_width = IB_WIDTH_1X;
+               *active_speed = IB_SPEED_SDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
+       case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
+               *active_width = IB_WIDTH_1X;
+               *active_speed = IB_SPEED_QDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
+       case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
+       case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
+               *active_width = IB_WIDTH_1X;
+               *active_speed = IB_SPEED_EDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
+       case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
+       case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
+       case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
+               *active_width = IB_WIDTH_4X;
+               *active_speed = IB_SPEED_QDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
+       case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
+       case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
+               *active_width = IB_WIDTH_1X;
+               *active_speed = IB_SPEED_HDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
+               *active_width = IB_WIDTH_4X;
+               *active_speed = IB_SPEED_FDR;
+               break;
+       case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
+       case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
+       case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
+       case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
+               *active_width = IB_WIDTH_4X;
+               *active_speed = IB_SPEED_EDR;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void mlx5_query_port_roce(struct ib_device *device, u8 port_num,
+                                struct ib_port_attr *props)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
+       struct mlx5_core_dev *mdev = dev->mdev;
        struct net_device *ndev, *upper;
        enum ib_mtu ndev_ib_mtu;
        u16 qkey_viol_cntr;
+       u32 eth_prot_oper;
 
-       /* props being zeroed by the caller, avoid zeroing it here */
+       /* Possible bad flows are checked before filling out props so in case
+        * of an error it will still be zeroed out.
+        */
+       if (mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, port_num))
+               return;
+
+       translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
+                                &props->active_width);
 
        props->port_cap_flags  |= IB_PORT_CM_SUP;
        props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
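/*
 * A self-contained userspace sketch (not kernel code) of the encoding used by
 * translate_eth_proto_oper() above: RoCE reports the Ethernet line rate as an
 * IB (active_width, active_speed) pair whose lane count times per-lane rate
 * matches the Ethernet rate, with sub-2.5 Gb/s links rounded up to 1X SDR,
 * the slowest encodable pair.  Per-lane rates are the standard IB values;
 * the table below is illustrative, not the kernel enums.
 */
#include <stdio.h>

struct rate_map {
        const char *eth;
        int lanes;              /* IB width expressed as a lane count */
        double gbps_per_lane;   /* IB speed expressed as Gb/s per lane */
};

int main(void)
{
        static const struct rate_map tbl[] = {
                { "1GbE",   1,  2.5 },  /* 1X SDR (rounded up) */
                { "10GbE",  1, 10.0 },  /* 1X QDR */
                { "25GbE",  1, 25.0 },  /* 1X EDR */
                { "40GbE",  4, 10.0 },  /* 4X QDR */
                { "50GbE",  1, 50.0 },  /* 1X HDR */
                { "56GbE",  4, 14.0 },  /* 4X FDR */
                { "100GbE", 4, 25.0 },  /* 4X EDR */
        };
        unsigned int i;

        for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
                printf("%-7s -> %dX x %.1f Gb/s = %5.1f Gb/s\n", tbl[i].eth,
                       tbl[i].lanes, tbl[i].gbps_per_lane,
                       tbl[i].lanes * tbl[i].gbps_per_lane);
        return 0;
}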
@@ -191,7 +259,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 
        ndev = mlx5_ib_get_netdev(device, port_num);
        if (!ndev)
-               return 0;
+               return;
 
        if (mlx5_lag_is_active(dev->mdev)) {
                rcu_read_lock();
@@ -214,11 +282,6 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
        dev_put(ndev);
 
        props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
-
-       props->active_width     = IB_WIDTH_4X;  /* TODO */
-       props->active_speed     = IB_SPEED_QDR; /* TODO */
-
-       return 0;
 }
 
 static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
@@ -924,7 +987,8 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
                return mlx5_query_hca_port(ibdev, port, props);
 
        case MLX5_VPORT_ACCESS_METHOD_NIC:
-               return mlx5_query_port_roce(ibdev, port, props);
+               mlx5_query_port_roce(ibdev, port, props);
+               return 0;
 
        default:
                return -EINVAL;
@@ -1478,7 +1542,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
        /* need to protect from a race on closing the vma as part of
         * mlx5_ib_vma_close.
         */
-       down_read(&owning_mm->mmap_sem);
+       down_write(&owning_mm->mmap_sem);
        list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
                                 list) {
                vma = vma_private->vma;
@@ -1488,11 +1552,12 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
                /* context going to be destroyed, should
                 * not access ops any more.
                 */
+               vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
                vma->vm_ops = NULL;
                list_del(&vma_private->list);
                kfree(vma_private);
        }
-       up_read(&owning_mm->mmap_sem);
+       up_write(&owning_mm->mmap_sem);
        mmput(owning_mm);
        put_task_struct(owning_process);
 }
@@ -1726,6 +1791,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
 #define LAST_TCP_UDP_FIELD src_port
 #define LAST_TUNNEL_FIELD tunnel_id
 #define LAST_FLOW_TAG_FIELD tag_id
+#define LAST_DROP_FIELD size
 
 /* Field is the last supported field */
 #define FIELDS_NOT_SUPPORTED(filter, field)\
@@ -1735,8 +1801,11 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
                   offsetof(typeof(filter), field) -\
                   sizeof(filter.field))
 
-static int parse_flow_attr(u32 *match_c, u32 *match_v,
-                          const union ib_flow_spec *ib_spec, u32 *tag_id)
+#define IPV4_VERSION 4
+#define IPV6_VERSION 6
+static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
+                          u32 *match_v, const union ib_flow_spec *ib_spec,
+                          u32 *tag_id, bool *is_drop)
 {
        void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
                                           misc_parameters);
@@ -1744,17 +1813,22 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
                                           misc_parameters);
        void *headers_c;
        void *headers_v;
+       int match_ipv;
 
        if (ib_spec->type & IB_FLOW_SPEC_INNER) {
                headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
                                         inner_headers);
                headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
                                         inner_headers);
+               match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+                                       ft_field_support.inner_ip_version);
        } else {
                headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
                                         outer_headers);
                headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
                                         outer_headers);
+               match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+                                       ft_field_support.outer_ip_version);
        }
 
        switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
@@ -1810,10 +1884,17 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
                        return -EOPNOTSUPP;
 
-               MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-                        ethertype, 0xffff);
-               MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-                        ethertype, ETH_P_IP);
+               if (match_ipv) {
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+                                ip_version, 0xf);
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+                                ip_version, IPV4_VERSION);
+               } else {
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+                                ethertype, 0xffff);
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+                                ethertype, ETH_P_IP);
+               }
 
                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
                                    src_ipv4_src_ipv6.ipv4_layout.ipv4),
@@ -1842,10 +1923,17 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
                        return -EOPNOTSUPP;
 
-               MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-                        ethertype, 0xffff);
-               MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-                        ethertype, ETH_P_IPV6);
+               if (match_ipv) {
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+                                ip_version, 0xf);
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+                                ip_version, IPV6_VERSION);
+               } else {
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+                                ethertype, 0xffff);
+                       MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+                                ethertype, ETH_P_IPV6);
+               }
 
                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
                                    src_ipv4_src_ipv6.ipv6_layout.ipv6),
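/*
 * A minimal userspace sketch of the matcher choice made in the two hunks
 * above: when the device reports ip_version support in ft_field_support, the
 * rule matches the 4-bit IP version field (mask 0xf, value 4 or 6); otherwise
 * it falls back to an exact ethertype match.  The struct below is a stand-in
 * for the fte match headers, not the mlx5 layout.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ETH_P_IP   0x0800
#define ETH_P_IPV6 0x86DD

struct l3_match_sketch {
        uint16_t ethertype_mask, ethertype_val;
        uint8_t  ip_version_mask, ip_version_val;
};

static void build_l3_match(struct l3_match_sketch *m, bool match_ipv, int ip_ver)
{
        if (match_ipv) {
                m->ip_version_mask = 0xf;       /* whole version nibble */
                m->ip_version_val  = ip_ver;    /* 4 or 6 */
        } else {
                m->ethertype_mask = 0xffff;
                m->ethertype_val  = (ip_ver == 4) ? ETH_P_IP : ETH_P_IPV6;
        }
}

int main(void)
{
        struct l3_match_sketch m = { 0 };

        build_l3_match(&m, true, 6);
        printf("ip_version %d/0x%x, ethertype 0x%04x/0x%04x\n",
               m.ip_version_val, m.ip_version_mask,
               m.ethertype_val, m.ethertype_mask);
        return 0;
}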
@@ -1937,6 +2025,12 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
 
                *tag_id = ib_spec->flow_tag.tag_id;
                break;
+       case IB_FLOW_SPEC_ACTION_DROP:
+               if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
+                                        LAST_DROP_FIELD))
+                       return -EOPNOTSUPP;
+               *is_drop = true;
+               break;
        default:
                return -EINVAL;
        }
@@ -1967,26 +2061,60 @@ static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
               is_multicast_ether_addr(eth_spec->val.dst_mac);
 }
 
-static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
+static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
+                              const struct ib_flow_attr *flow_attr,
+                              bool check_inner)
 {
        union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
-       bool has_ipv4_spec = false;
-       bool eth_type_ipv4 = true;
+       int match_ipv = check_inner ?
+                       MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+                                       ft_field_support.inner_ip_version) :
+                       MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+                                       ft_field_support.outer_ip_version);
+       int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
+       bool ipv4_spec_valid, ipv6_spec_valid;
+       unsigned int ip_spec_type = 0;
+       bool has_ethertype = false;
        unsigned int spec_index;
+       bool mask_valid = true;
+       u16 eth_type = 0;
+       bool type_valid;
 
        /* Validate that ethertype is correct */
        for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-               if (ib_spec->type == IB_FLOW_SPEC_ETH &&
+               if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
                    ib_spec->eth.mask.ether_type) {
-                       if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
-                             ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
-                               eth_type_ipv4 = false;
-               } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
-                       has_ipv4_spec = true;
+                       mask_valid = (ib_spec->eth.mask.ether_type ==
+                                     htons(0xffff));
+                       has_ethertype = true;
+                       eth_type = ntohs(ib_spec->eth.val.ether_type);
+               } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
+                          (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
+                       ip_spec_type = ib_spec->type;
                }
                ib_spec = (void *)ib_spec + ib_spec->size;
        }
-       return !has_ipv4_spec || eth_type_ipv4;
+
+       type_valid = (!has_ethertype) || (!ip_spec_type);
+       if (!type_valid && mask_valid) {
+               ipv4_spec_valid = (eth_type == ETH_P_IP) &&
+                       (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
+               ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
+                       (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
+
+               type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
+                            (((eth_type == ETH_P_MPLS_UC) ||
+                              (eth_type == ETH_P_MPLS_MC)) && match_ipv);
+       }
+
+       return type_valid;
+}
+
+static bool is_valid_attr(struct mlx5_core_dev *mdev,
+                         const struct ib_flow_attr *flow_attr)
+{
+       return is_valid_ethertype(mdev, flow_attr, false) &&
+              is_valid_ethertype(mdev, flow_attr, true);
 }
 
 static void put_flow_table(struct mlx5_ib_dev *dev,
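/*
 * A sketch of the consistency rule enforced by is_valid_ethertype() above: an
 * explicit ethertype in the ETH spec must agree with the L3 spec in the same
 * flow (0x0800 with IPv4, 0x86DD with IPv6), and MPLS ethertypes are accepted
 * only when the device can classify on the ip_version field instead.
 * Illustrative userspace code with stand-in types, not the mlx5 helpers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ETH_P_IP      0x0800
#define ETH_P_IPV6    0x86DD
#define ETH_P_MPLS_UC 0x8847
#define ETH_P_MPLS_MC 0x8848

enum l3_spec { L3_NONE = 0, L3_IPV4 = 4, L3_IPV6 = 6 };

static bool l2_l3_consistent(bool has_ethertype, bool mask_is_full,
                             uint16_t eth_type, enum l3_spec l3, bool match_ipv)
{
        if (!has_ethertype || l3 == L3_NONE)
                return true;            /* nothing to cross-check */
        if (!mask_is_full)
                return false;           /* partial ethertype masks rejected */
        if (eth_type == ETH_P_IP)
                return l3 == L3_IPV4;
        if (eth_type == ETH_P_IPV6)
                return l3 == L3_IPV6;
        if (eth_type == ETH_P_MPLS_UC || eth_type == ETH_P_MPLS_MC)
                return match_ipv;       /* IP version field still usable */
        return false;
}

int main(void)
{
        printf("%d %d\n",
               l2_l3_consistent(true, true, ETH_P_IP, L3_IPV4, false),  /* 1 */
               l2_l3_consistent(true, true, ETH_P_IP, L3_IPV6, false)); /* 0 */
        return 0;
}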
@@ -2038,8 +2166,8 @@ enum flow_table_type {
        MLX5_IB_FT_TX
 };
 
-#define MLX5_FS_MAX_TYPES       10
-#define MLX5_FS_MAX_ENTRIES     32000UL
+#define MLX5_FS_MAX_TYPES       6
+#define MLX5_FS_MAX_ENTRIES     BIT(16)
 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
                                                struct ib_flow_attr *flow_attr,
                                                enum flow_table_type ft_type)
@@ -2048,11 +2176,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
        struct mlx5_flow_namespace *ns = NULL;
        struct mlx5_ib_flow_prio *prio;
        struct mlx5_flow_table *ft;
+       int max_table_size;
        int num_entries;
        int num_groups;
        int priority;
        int err = 0;
 
+       max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+                                                      log_max_ft_size));
        if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
                if (flow_is_multicast_only(flow_attr) &&
                    !dont_trap)
@@ -2091,6 +2222,9 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
        if (!ns)
                return ERR_PTR(-ENOTSUPP);
 
+       if (num_entries > max_table_size)
+               return ERR_PTR(-ENOMEM);
+
        ft = prio->flow_table;
        if (!ft) {
                ft = mlx5_create_auto_grouped_flow_table(ns, priority,
@@ -2118,12 +2252,15 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
        struct mlx5_ib_flow_handler *handler;
        struct mlx5_flow_act flow_act = {0};
        struct mlx5_flow_spec *spec;
+       struct mlx5_flow_destination *rule_dst = dst;
        const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
        unsigned int spec_index;
        u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+       bool is_drop = false;
        int err = 0;
+       int dest_num = 1;
 
-       if (!is_valid_attr(flow_attr))
+       if (!is_valid_attr(dev->mdev, flow_attr))
                return ERR_PTR(-EINVAL);
 
        spec = mlx5_vzalloc(sizeof(*spec));
@@ -2136,8 +2273,9 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
        INIT_LIST_HEAD(&handler->list);
 
        for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-               err = parse_flow_attr(spec->match_criteria,
-                                     spec->match_value, ib_flow, &flow_tag);
+               err = parse_flow_attr(dev->mdev, spec->match_criteria,
+                                     spec->match_value,
+                                     ib_flow, &flow_tag, &is_drop);
                if (err < 0)
                        goto free;
 
@@ -2145,8 +2283,14 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
        }
 
        spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
-       flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
-               MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+       if (is_drop) {
+               flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+               rule_dst = NULL;
+               dest_num = 0;
+       } else {
+               flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
+                   MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+       }
 
        if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG &&
            (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
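/*
 * A sketch of the action/destination selection in the hunk above: a parsed
 * IB_FLOW_SPEC_ACTION_DROP overrides forwarding, so the rule is installed
 * with the DROP action and no destination; otherwise it forwards to the
 * supplied destination, or to the next priority when none is given.  Enum
 * names below are illustrative stand-ins, not the mlx5 definitions.
 */
#include <stdbool.h>
#include <stdio.h>

enum fs_action { FWD_DEST, FWD_NEXT_PRIO, DROP };

struct rule_cfg { enum fs_action action; int dest_num; };

static struct rule_cfg pick_action(bool is_drop, bool has_dst)
{
        if (is_drop)
                return (struct rule_cfg){ DROP, 0 };    /* rule_dst = NULL */
        return (struct rule_cfg){ has_dst ? FWD_DEST : FWD_NEXT_PRIO, 1 };
}

int main(void)
{
        struct rule_cfg c = pick_action(true, true);

        printf("action=%d dest_num=%d\n", c.action, c.dest_num);
        return 0;
}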
@@ -2159,7 +2303,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
        flow_act.flow_tag = flow_tag;
        handler->rule = mlx5_add_flow_rules(ft, spec,
                                            &flow_act,
-                                           dst, 1);
+                                           rule_dst, dest_num);
 
        if (IS_ERR(handler->rule)) {
                err = PTR_ERR(handler->rule);
@@ -2315,7 +2459,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
        int err;
 
        if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
-               return ERR_PTR(-ENOSPC);
+               return ERR_PTR(-ENOMEM);
 
        if (domain != IB_FLOW_DOMAIN_USER ||
            flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
@@ -3133,7 +3277,7 @@ static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
                mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
-struct mlx5_ib_q_counter {
+struct mlx5_ib_counter {
        const char *name;
        size_t offset;
 };
@@ -3141,18 +3285,18 @@ struct mlx5_ib_q_counter {
 #define INIT_Q_COUNTER(_name)          \
        { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
 
-static const struct mlx5_ib_q_counter basic_q_cnts[] = {
+static const struct mlx5_ib_counter basic_q_cnts[] = {
        INIT_Q_COUNTER(rx_write_requests),
        INIT_Q_COUNTER(rx_read_requests),
        INIT_Q_COUNTER(rx_atomic_requests),
        INIT_Q_COUNTER(out_of_buffer),
 };
 
-static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = {
+static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
        INIT_Q_COUNTER(out_of_sequence),
 };
 
-static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
+static const struct mlx5_ib_counter retrans_q_cnts[] = {
        INIT_Q_COUNTER(duplicate_request),
        INIT_Q_COUNTER(rnr_nak_retry_err),
        INIT_Q_COUNTER(packet_seq_err),
@@ -3160,22 +3304,31 @@ static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
        INIT_Q_COUNTER(local_ack_timeout_err),
 };
 
-static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
+#define INIT_CONG_COUNTER(_name)               \
+       { .name = #_name, .offset =     \
+               MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
+
+static const struct mlx5_ib_counter cong_cnts[] = {
+       INIT_CONG_COUNTER(rp_cnp_ignored),
+       INIT_CONG_COUNTER(rp_cnp_handled),
+       INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
+       INIT_CONG_COUNTER(np_cnp_sent),
+};
+
+static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
 {
        unsigned int i;
 
        for (i = 0; i < dev->num_ports; i++) {
                mlx5_core_dealloc_q_counter(dev->mdev,
-                                           dev->port[i].q_cnts.set_id);
-               kfree(dev->port[i].q_cnts.names);
-               kfree(dev->port[i].q_cnts.offsets);
+                                           dev->port[i].cnts.set_id);
+               kfree(dev->port[i].cnts.names);
+               kfree(dev->port[i].cnts.offsets);
        }
 }
 
-static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
-                                     const char ***names,
-                                     size_t **offsets,
-                                     u32 *num)
+static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
+                                   struct mlx5_ib_counters *cnts)
 {
        u32 num_counters;
 
@@ -3186,27 +3339,32 @@ static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
 
        if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
                num_counters += ARRAY_SIZE(retrans_q_cnts);
+       cnts->num_q_counters = num_counters;
 
-       *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL);
-       if (!*names)
+       if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+               cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
+               num_counters += ARRAY_SIZE(cong_cnts);
+       }
+
+       cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
+       if (!cnts->names)
                return -ENOMEM;
 
-       *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL);
-       if (!*offsets)
+       cnts->offsets = kcalloc(num_counters,
+                               sizeof(cnts->offsets), GFP_KERNEL);
+       if (!cnts->offsets)
                goto err_names;
 
-       *num = num_counters;
-
        return 0;
 
 err_names:
-       kfree(*names);
+       kfree(cnts->names);
        return -ENOMEM;
 }
 
-static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
-                                   const char **names,
-                                   size_t *offsets)
+static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
+                                 const char **names,
+                                 size_t *offsets)
 {
        int i;
        int j = 0;
@@ -3229,9 +3387,16 @@ static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
                        offsets[j] = retrans_q_cnts[i].offset;
                }
        }
+
+       if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+               for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
+                       names[j] = cong_cnts[i].name;
+                       offsets[j] = cong_cnts[i].offset;
+               }
+       }
 }
 
-static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
+static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
 {
        int i;
        int ret;
@@ -3240,7 +3405,7 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
                struct mlx5_ib_port *port = &dev->port[i];
 
                ret = mlx5_core_alloc_q_counter(dev->mdev,
-                                               &port->q_cnts.set_id);
+                                               &port->cnts.set_id);
                if (ret) {
                        mlx5_ib_warn(dev,
                                     "couldn't allocate queue counter for port %d, err %d\n",
@@ -3248,15 +3413,12 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
                        goto dealloc_counters;
                }
 
-               ret = __mlx5_ib_alloc_q_counters(dev,
-                                                &port->q_cnts.names,
-                                                &port->q_cnts.offsets,
-                                                &port->q_cnts.num_counters);
+               ret = __mlx5_ib_alloc_counters(dev, &port->cnts);
                if (ret)
                        goto dealloc_counters;
 
-               mlx5_ib_fill_q_counters(dev, port->q_cnts.names,
-                                       port->q_cnts.offsets);
+               mlx5_ib_fill_counters(dev, port->cnts.names,
+                                     port->cnts.offsets);
        }
 
        return 0;
@@ -3264,7 +3426,7 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
 dealloc_counters:
        while (--i >= 0)
                mlx5_core_dealloc_q_counter(dev->mdev,
-                                           dev->port[i].q_cnts.set_id);
+                                           dev->port[i].cnts.set_id);
 
        return ret;
 }
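/*
 * A small userspace sketch of how the congestion counters declared with
 * INIT_CONG_COUNTER() above are decoded by mlx5_ib_query_cong_counters()
 * below: the device splits each 64-bit counter into _high/_low 32-bit fields,
 * the offset table points at the _high field, and a single big-endian 64-bit
 * load at that offset yields the full value.  The buffer layout here is a
 * stand-in, not the query_cong_statistics_out layout.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>          /* ntohl() */

static uint64_t be64_at(const unsigned char *p)
{
        uint32_t hi, lo;

        memcpy(&hi, p, sizeof(hi));             /* _high field */
        memcpy(&lo, p + 4, sizeof(lo));         /* _low field  */
        return ((uint64_t)ntohl(hi) << 32) | ntohl(lo);
}

int main(void)
{
        /* pretend query output buffer with one counter at byte offset 8 */
        unsigned char out[16] = { 0 };

        out[11] = 0x01;                         /* high word = 0x00000001 */
        out[15] = 0x2a;                         /* low word  = 0x0000002a */
        printf("counter = 0x%llx\n", (unsigned long long)be64_at(out + 8));
        return 0;
}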
@@ -3279,44 +3441,113 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
        if (port_num == 0)
                return NULL;
 
-       return rdma_alloc_hw_stats_struct(port->q_cnts.names,
-                                         port->q_cnts.num_counters,
+       return rdma_alloc_hw_stats_struct(port->cnts.names,
+                                         port->cnts.num_q_counters +
+                                         port->cnts.num_cong_counters,
                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
 }
 
-static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
-                               struct rdma_hw_stats *stats,
-                               u8 port_num, int index)
+static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev,
+                                   struct mlx5_ib_port *port,
+                                   struct rdma_hw_stats *stats)
 {
-       struct mlx5_ib_dev *dev = to_mdev(ibdev);
-       struct mlx5_ib_port *port = &dev->port[port_num - 1];
        int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
        void *out;
        __be32 val;
-       int ret;
-       int i;
-
-       if (!stats)
-               return -ENOSYS;
+       int ret, i;
 
        out = mlx5_vzalloc(outlen);
        if (!out)
                return -ENOMEM;
 
        ret = mlx5_core_query_q_counter(dev->mdev,
-                                       port->q_cnts.set_id, 0,
+                                       port->cnts.set_id, 0,
                                        out, outlen);
        if (ret)
                goto free;
 
-       for (i = 0; i < port->q_cnts.num_counters; i++) {
-               val = *(__be32 *)(out + port->q_cnts.offsets[i]);
+       for (i = 0; i < port->cnts.num_q_counters; i++) {
+               val = *(__be32 *)(out + port->cnts.offsets[i]);
                stats->value[i] = (u64)be32_to_cpu(val);
        }
 
 free:
        kvfree(out);
-       return port->q_cnts.num_counters;
+       return ret;
+}
+
+static int mlx5_ib_query_cong_counters(struct mlx5_ib_dev *dev,
+                                      struct mlx5_ib_port *port,
+                                      struct rdma_hw_stats *stats)
+{
+       int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
+       void *out;
+       int ret, i;
+       int offset = port->cnts.num_q_counters;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       ret = mlx5_cmd_query_cong_counter(dev->mdev, false, out, outlen);
+       if (ret)
+               goto free;
+
+       for (i = 0; i < port->cnts.num_cong_counters; i++) {
+               stats->value[i + offset] =
+                       be64_to_cpup((__be64 *)(out +
+                                    port->cnts.offsets[i + offset]));
+       }
+
+free:
+       kvfree(out);
+       return ret;
+}
+
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+                               struct rdma_hw_stats *stats,
+                               u8 port_num, int index)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       struct mlx5_ib_port *port = &dev->port[port_num - 1];
+       int ret, num_counters;
+
+       if (!stats)
+               return -EINVAL;
+
+       ret = mlx5_ib_query_q_counters(dev, port, stats);
+       if (ret)
+               return ret;
+       num_counters = port->cnts.num_q_counters;
+
+       if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+               ret = mlx5_ib_query_cong_counters(dev, port, stats);
+               if (ret)
+                       return ret;
+               num_counters += port->cnts.num_cong_counters;
+       }
+
+       return num_counters;
+}
+
+static struct net_device*
+mlx5_ib_alloc_rdma_netdev(struct ib_device *hca,
+                         u8 port_num,
+                         enum rdma_netdev_t type,
+                         const char *name,
+                         unsigned char name_assign_type,
+                         void (*setup)(struct net_device *))
+{
+       if (type != RDMA_NETDEV_IPOIB)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       return mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca,
+                                     name, setup);
+}
+
+static void mlx5_ib_free_rdma_netdev(struct net_device *netdev)
+{
+       return mlx5_rdma_netdev_free(netdev);
 }
 
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
@@ -3449,6 +3680,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
        dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
        dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
+       dev->ib_dev.alloc_rdma_netdev   = mlx5_ib_alloc_rdma_netdev;
+       dev->ib_dev.free_rdma_netdev    = mlx5_ib_free_rdma_netdev;
        if (mlx5_core_is_pf(mdev)) {
                dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
                dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
@@ -3523,14 +3756,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                goto err_rsrc;
 
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-               err = mlx5_ib_alloc_q_counters(dev);
+               err = mlx5_ib_alloc_counters(dev);
                if (err)
                        goto err_odp;
        }
 
        dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
        if (!dev->mdev->priv.uar)
-               goto err_q_cnt;
+               goto err_cnt;
 
        err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
        if (err)
@@ -3574,9 +3807,9 @@ err_bfreg:
 err_uar_page:
        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 
-err_q_cnt:
+err_cnt:
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-               mlx5_ib_dealloc_q_counters(dev);
+               mlx5_ib_dealloc_counters(dev);
 
 err_odp:
        mlx5_ib_odp_remove_one(dev);
@@ -3610,7 +3843,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
        mlx5_free_bfreg(dev->mdev, &dev->bfreg);
        mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-               mlx5_ib_dealloc_q_counters(dev);
+               mlx5_ib_dealloc_counters(dev);
        destroy_umrc_res(dev);
        mlx5_ib_odp_remove_one(dev);
        destroy_dev_resources(&dev->devr);